TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
SimpleStopwordsRemover.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using System.Linq;
20 using System.Text;
21 using System.Text.RegularExpressions;
22 using TraceLabSDK.Types;
23 
24 namespace TraceLab.Components.DevelopmentKit.Preprocessors
25 {
/// <summary>
/// Simple token filter: drops tokens shorter than a minimum length and, optionally,
/// tokens made up only of the ASCII digits 0-9.
/// </summary>
public static class SimpleStopwordsRemover
{
    /// <summary>
    /// Builds a new collection holding one artifact per input artifact, with the same Id
    /// and with its text filtered by <see cref="ProcessText"/>.
    /// </summary>
    /// <param name="listOfArtifacts">Artifacts whose text should be filtered.</param>
    /// <param name="minWordLength">Minimum number of characters a token must have to be kept.</param>
    /// <param name="removeNumbers">If true, tokens consisting only of the digits 0-9 are dropped.</param>
    /// <returns>A new collection containing the processed artifacts.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="listOfArtifacts"/> is null.</exception>
    public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, int minWordLength, bool removeNumbers)
    {
        if (listOfArtifacts == null)
        {
            throw new ArgumentNullException("listOfArtifacts");
        }

        TLArtifactsCollection processed = new TLArtifactsCollection();
        foreach (TLArtifact artifact in listOfArtifacts.Values)
        {
            TLArtifact processedArtifact = new TLArtifact(artifact.Id, String.Empty);
            processedArtifact.Text = ProcessText(artifact.Text, minWordLength, removeNumbers);
            processed.Add(processedArtifact);
        }
        return processed;
    }

    /// <summary>
    /// Splits the text on white-space, discards tokens that are too short (and, optionally,
    /// numeric tokens), and joins the remaining tokens with single spaces.
    /// </summary>
    /// <param name="text">Text to filter. Null or empty text yields an empty string.</param>
    /// <param name="minWordLength">Minimum number of characters a token must have to be kept.</param>
    /// <param name="removeNumbers">If true, tokens consisting only of the digits 0-9 are dropped.</param>
    /// <returns>The kept tokens in their original order, separated by single spaces.</returns>
    public static string ProcessText(string text, int minWordLength, bool removeNumbers)
    {
        if (String.IsNullOrEmpty(text))
        {
            return String.Empty;
        }

        // A null separator array splits on white-space characters. RemoveEmptyEntries keeps
        // leading, trailing or repeated white-space from producing empty tokens, which would
        // otherwise leak into the output as stray spaces when minWordLength is 0 or less.
        string[] tokens = text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

        List<string> kept = new List<string>(tokens.Length);
        foreach (string token in tokens)
        {
            if (token.Length < minWordLength)
            {
                continue;
            }

            if (removeNumbers && IsNumber(token))
            {
                continue;
            }

            kept.Add(token);
        }

        return String.Join(" ", kept.ToArray());
    }

    /// <summary>
    /// Tells whether the text is non-empty and consists only of the characters '0' through '9'.
    /// Signs, decimal separators and non-ASCII digits do not count as numeric.
    /// </summary>
    /// <param name="text">Token to test; must not be null.</param>
    /// <returns>True if every character is an ASCII digit and there is at least one character.</returns>
    private static bool IsNumber(string text)
    {
        if (text.Length == 0)
        {
            // An empty token is not a number; without this check the loop below would
            // vacuously report true.
            return false;
        }

        foreach (char c in text)
        {
            if (c < '0' || c > '9')
            {
                return false;
            }
        }

        return true;
    }
}
94 }