TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
StopwordsRemover.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using System.Linq;
20 using System.Text;
21 using TraceLabSDK;
22 using TraceLabSDK.Types;
23 
namespace TraceLab.Components.DevelopmentKit.Preprocessors
{
    /// <summary>
    /// Filters whitespace-separated text, dropping stopwords, tokens shorter than a
    /// minimum length and, optionally, tokens made up only of the digits 0-9.
    /// </summary>
    public static class StopwordsRemover
    {
        /// <summary>
        /// Runs <see cref="ProcessText"/> over the text of every artifact in a collection.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts whose text should be filtered.</param>
        /// <param name="stopwords">Words to remove from each artifact's text.</param>
        /// <param name="minWordLength">Tokens shorter than this many characters are removed.</param>
        /// <param name="removeNumbers">When true, tokens consisting only of the digits 0-9 are removed.</param>
        /// <returns>
        /// A new collection holding one newly created artifact per input artifact,
        /// with the same Id and the filtered text.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="listOfArtifacts"/> is null, or <paramref name="stopwords"/> is null
        /// and at least one artifact has to be processed.
        /// </exception>
        public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
        {
            if (listOfArtifacts == null)
            {
                throw new ArgumentNullException("listOfArtifacts");
            }

            TLArtifactsCollection processed = new TLArtifactsCollection();
            foreach (TLArtifact artifact in listOfArtifacts.Values)
            {
                // Create a fresh artifact instead of touching the caller's instance.
                TLArtifact processedArtifact = new TLArtifact(artifact.Id, String.Empty);
                processedArtifact.Text = ProcessText(artifact.Text, stopwords, minWordLength, removeNumbers);
                processed.Add(processedArtifact);
            }

            return processed;
        }

        /// <summary>
        /// Splits the text on whitespace and keeps only the tokens that are not stopwords,
        /// are at least <paramref name="minWordLength"/> characters long and, when
        /// <paramref name="removeNumbers"/> is set, are not made up solely of the digits 0-9.
        /// </summary>
        /// <param name="textToProcess">Text to filter. Null or empty text yields an empty string.</param>
        /// <param name="stopwords">Words to remove.</param>
        /// <param name="minWordLength">Tokens shorter than this many characters are removed.</param>
        /// <param name="removeNumbers">When true, tokens consisting only of the digits 0-9 are removed.</param>
        /// <returns>The surviving tokens in their original order, separated by single spaces.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stopwords"/> is null.</exception>
        public static string ProcessText(string textToProcess, TLStopwords stopwords, int minWordLength, bool removeNumbers)
        {
            if (stopwords == null)
            {
                throw new ArgumentNullException("stopwords");
            }

            if (string.IsNullOrEmpty(textToProcess))
            {
                return string.Empty;
            }

            // A null separator array means "split on any whitespace". RemoveEmptyEntries
            // discards the empty tokens produced by leading, trailing or repeated
            // whitespace, so they can never leak into the output as extra spaces.
            string[] tokens = textToProcess.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            StringBuilder builder = new StringBuilder(textToProcess.Length);
            foreach (string token in tokens)
            {
                if (token.Length < minWordLength || stopwords.Contains(token))
                {
                    continue;
                }

                if (removeNumbers && IsNumber(token))
                {
                    continue;
                }

                // Put the separator before every token except the first so the result
                // needs no trimming afterwards.
                if (builder.Length > 0)
                {
                    builder.Append(' ');
                }

                builder.Append(token);
            }

            return builder.ToString();
        }

        /// <summary>
        /// Tells whether the text is a non-empty run of the characters '0' through '9'.
        /// Signs, decimal separators and any other characters make the result false.
        /// </summary>
        /// <param name="text">Token to inspect; must not be null.</param>
        /// <returns>True when every character is a digit 0-9 and there is at least one character.</returns>
        private static bool IsNumber(string text)
        {
            // An empty string contains no digits, so it is not a number.
            if (text.Length == 0)
            {
                return false;
            }

            foreach (char c in text)
            {
                if (c < '0' || c > '9')
                {
                    return false;
                }
            }

            return true;
        }
    }
}