18 using System.Collections.Generic;
22 using TraceLabSDK.Types;
24 namespace TraceLab.Components.DevelopmentKit.Preprocessors
/// <summary>
/// Creates a new artifacts collection containing, for every artifact in
/// <paramref name="listOfArtifacts"/>, an artifact with the same id whose text
/// is the result of running ProcessText on the source artifact's text.
/// </summary>
/// <param name="listOfArtifacts">Collection whose Values are visited one by one.</param>
/// <param name="stopwords">Stopwords passed through to ProcessText.</param>
/// <param name="minWordLength">Minimum token length passed through to ProcessText.</param>
/// <param name="removeNumbers">Number-removal flag passed through to ProcessText.</param>
/// <returns>The newly built collection.</returns>
public static TLArtifactsCollection ProcessArtifacts(
    TLArtifactsCollection listOfArtifacts,
    TLStopwords stopwords,
    int minWordLength,
    bool removeNumbers)
{
    TLArtifactsCollection output = new TLArtifactsCollection();

    foreach (TLArtifact source in listOfArtifacts.Values)
    {
        // Start from an empty-text artifact carrying the source id, then fill in the processed text.
        TLArtifact copy = new TLArtifact(source.Id, String.Empty);
        string cleanedText = ProcessText(source.Text, stopwords, minWordLength, removeNumbers);
        copy.Text = cleanedText;

        output.Add(copy);
    }

    return output;
}
/// <summary>
/// Splits <paramref name="textToProcess"/> into tokens, filters them, and joins
/// the kept tokens into a single space-separated string.
/// </summary>
/// <param name="textToProcess">Text to tokenize; it is dereferenced directly, so it must not be null.</param>
/// <param name="stopwords">Tokens contained in this set are discarded.</param>
/// <param name="minWordLength">Tokens shorter than this many characters are discarded.</param>
/// <param name="removeNumbers">When true, each surviving token is additionally checked with IsNumber.</param>
// NOTE(review): this listing appears to omit lines (braces, the body of the
// number check, the final return) - confirm the details against the full file.
59 public static string ProcessText(
string textToProcess, TLStopwords stopwords,
int minWordLength,
bool removeNumbers)
// Collects the kept tokens, each followed by one space.
61 StringBuilder builder =
new StringBuilder();
62 string result =
string.Empty;
// Split() with no separators splits on white-space characters; runs of
// white space therefore yield empty tokens.
63 string[] tokens = textToProcess.Split();
64 foreach (
string token
in tokens)
// Only tokens that are not stopwords and are at least minWordLength long proceed.
66 if (!stopwords.Contains(token) && token.Length >= minWordLength)
// IsNumber is consulted only when removeNumbers is set.
// NOTE(review): the branch body is not visible here - presumably numeric
// tokens are skipped rather than appended; verify.
68 if (removeNumbers && IsNumber(token))
// Append the token followed by a single separating space.
74 builder.AppendFormat(
"{0} ", token);
// Trim() removes the trailing separator left after the last appended token.
78 result = builder.ToString().Trim();
82 private static bool IsNumber(
string text)
84 int len = text.Length;
85 for (
int i = 0; i < len; ++i)
88 if (c < '0' || c >
'9')