18 using System.Collections.Generic;
21 using System.Text.RegularExpressions;
22 using TraceLabSDK.Types;
24 namespace TraceLab.Components.DevelopmentKit.Preprocessors
/// <summary>
/// Builds a new artifacts collection in which each input artifact is represented by a
/// fresh artifact with the same id whose text is the result of
/// <see cref="ProcessText"/> applied to the original text.
/// </summary>
/// <param name="listOfArtifacts">Artifacts to process. Only its values are read.</param>
/// <param name="minWordLength">Forwarded unchanged to <see cref="ProcessText"/>.</param>
/// <param name="removeNumbers">Forwarded unchanged to <see cref="ProcessText"/>.</param>
/// <returns>A newly created collection to which one processed artifact was added per input artifact.</returns>
/// <exception cref="ArgumentNullException"><paramref name="listOfArtifacts"/> is null.</exception>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, int minWordLength, bool removeNumbers)
{
    // Fail with a descriptive exception instead of a NullReferenceException
    // raised from the loop header below.
    if (listOfArtifacts == null)
    {
        throw new ArgumentNullException("listOfArtifacts");
    }

    TLArtifactsCollection processed = new TLArtifactsCollection();

    foreach (TLArtifact artifact in listOfArtifacts.Values)
    {
        // The copy is created with empty text and the processed text is assigned
        // afterwards, so the input artifact is never modified.
        TLArtifact processedArtifact = new TLArtifact(artifact.Id, String.Empty);
        processedArtifact.Text = ProcessText(artifact.Text, minWordLength, removeNumbers);
        processed.Add(processedArtifact);
    }

    return processed;
}
/// <summary>
/// Splits the text on white-space and rejoins, separated by single spaces, the tokens
/// that are at least <paramref name="minWordLength"/> characters long and, when
/// <paramref name="removeNumbers"/> is true, are not reported as numbers by
/// <see cref="IsNumber"/>.
/// </summary>
/// <param name="text">Text to filter. Null is treated like empty text.</param>
/// <param name="minWordLength">Minimum number of characters a token must have to be kept.</param>
/// <param name="removeNumbers">When true, tokens for which <see cref="IsNumber"/> returns true are dropped.</param>
/// <returns>The kept tokens joined by single spaces, without leading or trailing space; empty when nothing is kept.</returns>
public static string ProcessText(string text, int minWordLength, bool removeNumbers)
{
    // Null text (e.g. an artifact without content) yields an empty result
    // instead of a NullReferenceException on Split.
    if (String.IsNullOrEmpty(text))
    {
        return String.Empty;
    }

    // A null separator array splits on white-space. RemoveEmptyEntries discards the
    // empty tokens produced by consecutive, leading or trailing white-space, which
    // would otherwise pass the length test when minWordLength <= 0 and end up as
    // stray spaces in the output.
    string[] tokens = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    StringBuilder builder = new StringBuilder(text.Length);

    foreach (string token in tokens)
    {
        if (token.Length < minWordLength)
        {
            continue;
        }

        if (removeNumbers && IsNumber(token))
        {
            continue;
        }

        // Separator goes before every token except the first, so no trailing
        // space has to be trimmed afterwards.
        if (builder.Length > 0)
        {
            builder.Append(' ');
        }

        builder.Append(token);
    }

    return builder.ToString();
}
82 private static bool IsNumber(
string text)
84 int len = text.Length;
85 for (
int i = 0; i < len; ++i)
88 if (c < '0' || c >
'9')