TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
PorterStemmer.cs
Go to the documentation of this file.
1 // Porter stemmer in CSharp, based on the Java port. The original paper is in
2 //
3 // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
4 // no. 3, pp 130-137,
5 //
6 // See also http://www.tartarus.org/~martin/PorterStemmer
7 //
8 // History:
9 //
10 // Release 1
11 //
12 // Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
13 // The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
14 // is then out outside the bounds of b.
15 //
16 // Release 2
17 //
18 // Similarly,
19 //
20 // Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
21 // 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
22 // b[j] is then outside the bounds of b.
23 //
24 // Release 3
25 //
26 // Considerably revised 4/9/00 in the light of many helpful suggestions
27 // from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
28 //
29 // Release 4
30 //
31 // This revision allows the Porter Stemmer Algorithm to be exported via the
32 // .NET Framework. To facilate its use via .NET, the following commands need to be
33 // issued to the operating system to register the component so that it can be
34 // imported into .Net compatible languages, such as Delphi.NET, Visual Basic.NET,
35 // Visual C++.NET, etc.
36 //
37 // 1. Create a stong name:
38 // sn -k Keyfile.snk
39 // 2. Compile the C# class, which creates an assembly PorterStemmerAlgorithm.dll
40 // csc /t:library PorterStemmerAlgorithm.cs
41 // 3. Register the dll with the Windows Registry
42 // and so expose the interface to COM Clients via the type library
43 // ( PorterStemmerAlgorithm.tlb will be created)
44 // regasm /tlb PorterStemmerAlgorithm.dll
45 // 4. Load the component in the Global Assembly Cache
46 // gacutil -i PorterStemmerAlgorithm.dll
47 //
48 // Note: You must have the .Net Studio installed.
49 //
50 // Once this process is performed you should be able to import the class
51 // via the appropiate mechanism in the language that you are using.
52 //
53 // i.e in Delphi 7 .NET this is simply a matter of selecting:
54 // Project | Import Type Libary
55 // And then selecting Porter stemmer in CSharp Version 1.4"!
56 //
57 // Cheers Leif
58 
59 using System;
60 
61 namespace TraceLab.Components.DevelopmentKit.Preprocessors.Stemmers.Porter
62 {
66  public class PorterStemmer
67  {
68  private char[] b;
69  private int i, /* offset into b */
70  i_end, /* offset to end of stemmed word */
71  j, k;
72  private static int INC = 200;
73  /* unit of size whereby b is increased */
74 
78  public PorterStemmer()
79  {
80  b = new char[INC];
81  i = 0;
82  i_end = 0;
83  }
84 
90  public string stemTerm(string s)
91  {
92  setTerm(s);
93  stem();
94  return getTerm();
95  }
96 
108  void setTerm(string s)
109  {
110  i = s.Length;
111  char[] new_b = new char[i];
112  for (int c = 0; c < i; c++)
113  new_b[c] = s[c];
114 
115  b = new_b;
116 
117  }
118 
123  public string getTerm()
124  {
125  return new String(b, 0, i_end);
126  }
127 
133  public void add(char ch)
134  {
135  if (i == b.Length)
136  {
137  char[] new_b = new char[i + INC];
138  for (int c = 0; c < i; c++)
139  new_b[c] = b[c];
140  b = new_b;
141  }
142  b[i++] = ch;
143  }
144 
152  public void add(char[] w, int wLen)
153  {
154  if (i + wLen >= b.Length)
155  {
156  char[] new_b = new char[i + wLen + INC];
157  for (int c = 0; c < i; c++)
158  new_b[c] = b[c];
159  b = new_b;
160  }
161  for (int c = 0; c < wLen; c++)
162  b[i++] = w[c];
163  }
164 
170  public override string ToString()
171  {
172  return new String(b, 0, i_end);
173  }
174 
178  public int getResultLength()
179  {
180  return i_end;
181  }
182 
188  public char[] getResultBuffer()
189  {
190  return b;
191  }
192 
193  /* cons(i) is true <=> b[i] is a consonant. */
194  private bool cons(int i)
195  {
196  switch (b[i])
197  {
198  case 'a':
199  case 'e':
200  case 'i':
201  case 'o':
202  case 'u': return false;
203  case 'y': return (i == 0) ? true : !cons(i - 1);
204  default: return true;
205  }
206  }
207 
208  /* m() measures the number of consonant sequences between 0 and j. if c is
209  a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
210  presence,
211 
212  <c><v> gives 0
213  <c>vc<v> gives 1
214  <c>vcvc<v> gives 2
215  <c>vcvcvc<v> gives 3
216  ....
217  */
218  private int m()
219  {
220  int n = 0;
221  int i = 0;
222  while (true)
223  {
224  if (i > j) return n;
225  if (!cons(i)) break; i++;
226  }
227  i++;
228  while (true)
229  {
230  while (true)
231  {
232  if (i > j) return n;
233  if (cons(i)) break;
234  i++;
235  }
236  i++;
237  n++;
238  while (true)
239  {
240  if (i > j) return n;
241  if (!cons(i)) break;
242  i++;
243  }
244  i++;
245  }
246  }
247 
248  /* vowelinstem() is true <=> 0,...j contains a vowel */
249  private bool vowelinstem()
250  {
251  int i;
252  for (i = 0; i <= j; i++)
253  if (!cons(i))
254  return true;
255  return false;
256  }
257 
258  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
259  private bool doublec(int j)
260  {
261  if (j < 1)
262  return false;
263  if (b[j] != b[j - 1])
264  return false;
265  return cons(j);
266  }
267 
268  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
269  and also if the second c is not w,x or y. this is used when trying to
270  restore an e at the end of a short word. e.g.
271 
272  cav(e), lov(e), hop(e), crim(e), but
273  snow, box, tray.
274 
275  */
276  private bool cvc(int i)
277  {
278  if (i < 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
279  return false;
280  int ch = b[i];
281  if (ch == 'w' || ch == 'x' || ch == 'y')
282  return false;
283  return true;
284  }
285 
286  private bool ends(String s)
287  {
288  int l = s.Length;
289  int o = k - l + 1;
290  if (o < 0)
291  return false;
292  char[] sc = s.ToCharArray();
293  for (int i = 0; i < l; i++)
294  if (b[o + i] != sc[i])
295  return false;
296  j = k - l;
297  return true;
298  }
299 
300  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
301  k. */
302  private void setto(String s)
303  {
304  int l = s.Length;
305  int o = j + 1;
306  char[] sc = s.ToCharArray();
307  for (int i = 0; i < l; i++)
308  b[o + i] = sc[i];
309  k = j + l;
310  }
311 
312  /* r(s) is used further down. */
313  private void r(String s)
314  {
315  if (m() > 0)
316  setto(s);
317  }
318 
319  /* step1() gets rid of plurals and -ed or -ing. e.g.
320  caresses -> caress
321  ponies -> poni
322  ties -> ti
323  caress -> caress
324  cats -> cat
325 
326  feed -> feed
327  agreed -> agree
328  disabled -> disable
329 
330  matting -> mat
331  mating -> mate
332  meeting -> meet
333  milling -> mill
334  messing -> mess
335 
336  meetings -> meet
337  */
338  private void step1()
339  {
340  if (b[k] == 's')
341  {
342  if (ends("sses"))
343  k -= 2;
344  else if (ends("ies"))
345  setto("i");
346  else if (b[k - 1] != 's')
347  k--;
348  }
349  if (ends("eed"))
350  {
351  if (m() > 0)
352  k--;
353  }
354  else if ((ends("ed") || ends("ing")) && vowelinstem())
355  {
356  k = j;
357  if (ends("at"))
358  setto("ate");
359  else if (ends("bl"))
360  setto("ble");
361  else if (ends("iz"))
362  setto("ize");
363  else if (doublec(k))
364  {
365  k--;
366  int ch = b[k];
367  if (ch == 'l' || ch == 's' || ch == 'z')
368  k++;
369  }
370  else if (m() == 1 && cvc(k)) setto("e");
371  }
372  }
373 
374  /* step2() turns terminal y to i when there is another vowel in the stem. */
375  private void step2()
376  {
377  if (ends("y") && vowelinstem())
378  b[k] = 'i';
379  }
380 
381  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
382  -ation) maps to -ize etc. note that the string before the suffix must give
383  m() > 0. */
384  private void step3()
385  {
386  if (k == 0)
387  return;
388 
389  /* For Bug 1 */
390  switch (b[k - 1])
391  {
392  case 'a':
393  if (ends("ational")) { r("ate"); break; }
394  if (ends("tional")) { r("tion"); break; }
395  break;
396  case 'c':
397  if (ends("enci")) { r("ence"); break; }
398  if (ends("anci")) { r("ance"); break; }
399  break;
400  case 'e':
401  if (ends("izer")) { r("ize"); break; }
402  break;
403  case 'l':
404  if (ends("bli")) { r("ble"); break; }
405  if (ends("alli")) { r("al"); break; }
406  if (ends("entli")) { r("ent"); break; }
407  if (ends("eli")) { r("e"); break; }
408  if (ends("ousli")) { r("ous"); break; }
409  break;
410  case 'o':
411  if (ends("ization")) { r("ize"); break; }
412  if (ends("ation")) { r("ate"); break; }
413  if (ends("ator")) { r("ate"); break; }
414  break;
415  case 's':
416  if (ends("alism")) { r("al"); break; }
417  if (ends("iveness")) { r("ive"); break; }
418  if (ends("fulness")) { r("ful"); break; }
419  if (ends("ousness")) { r("ous"); break; }
420  break;
421  case 't':
422  if (ends("aliti")) { r("al"); break; }
423  if (ends("iviti")) { r("ive"); break; }
424  if (ends("biliti")) { r("ble"); break; }
425  break;
426  case 'g':
427  if (ends("logi")) { r("log"); break; }
428  break;
429  default:
430  break;
431  }
432  }
433 
434  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
435  private void step4()
436  {
437  switch (b[k])
438  {
439  case 'e':
440  if (ends("icate")) { r("ic"); break; }
441  if (ends("ative")) { r(""); break; }
442  if (ends("alize")) { r("al"); break; }
443  break;
444  case 'i':
445  if (ends("iciti")) { r("ic"); break; }
446  break;
447  case 'l':
448  if (ends("ical")) { r("ic"); break; }
449  if (ends("ful")) { r(""); break; }
450  break;
451  case 's':
452  if (ends("ness")) { r(""); break; }
453  break;
454  }
455  }
456 
457  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
458  private void step5()
459  {
460  if (k == 0)
461  return;
462 
463  /* for Bug 1 */
464  switch (b[k - 1])
465  {
466  case 'a':
467  if (ends("al")) break; return;
468  case 'c':
469  if (ends("ance")) break;
470  if (ends("ence")) break; return;
471  case 'e':
472  if (ends("er")) break; return;
473  case 'i':
474  if (ends("ic")) break; return;
475  case 'l':
476  if (ends("able")) break;
477  if (ends("ible")) break; return;
478  case 'n':
479  if (ends("ant")) break;
480  if (ends("ement")) break;
481  if (ends("ment")) break;
482  /* element etc. not stripped before the m */
483  if (ends("ent")) break; return;
484  case 'o':
485  if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
486  /* j >= 0 fixes Bug 2 */
487  if (ends("ou")) break; return;
488  /* takes care of -ous */
489  case 's':
490  if (ends("ism")) break; return;
491  case 't':
492  if (ends("ate")) break;
493  if (ends("iti")) break; return;
494  case 'u':
495  if (ends("ous")) break; return;
496  case 'v':
497  if (ends("ive")) break; return;
498  case 'z':
499  if (ends("ize")) break; return;
500  default:
501  return;
502  }
503  if (m() > 1)
504  k = j;
505  }
506 
507  /* step6() removes a final -e if m() > 1. */
508  private void step6()
509  {
510  j = k;
511 
512  if (b[k] == 'e')
513  {
514  int a = m();
515  if (a > 1 || a == 1 && !cvc(k - 1))
516  k--;
517  }
518  if (b[k] == 'l' && doublec(k) && m() > 1)
519  k--;
520  }
521 
528  public void stem()
529  {
530  k = i - 1;
531  if (k > 1)
532  {
533  step1();
534  step2();
535  step3();
536  step4();
537  step5();
538  step6();
539  }
540  i_end = k + 1;
541  i = 0;
542  }
543  }
544 }