TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
WeightUtil.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using System.Linq;
20 using System.Text;
21 using TraceLabSDK.Types;
22 
23 namespace TraceLab.Components.DevelopmentKit.Utils.TermDocumentMatrixUtils
24 {
28  public static class WeightUtil
29  {
// NOTE(review): the method signature that belongs directly above this body is
// missing from this listing, so the method name and return type are not documented here.
// The body overwrites every cell of `matrix` in place: a cell greater than 0.0
// becomes 1.0, every other cell becomes 0.0. The same `matrix` object is then returned.
36  {
37  for (int i = 0; i < matrix.NumDocs; i++)
38  {
39  for (int j = 0; j < matrix.NumTerms; j++)
40  {
// First index is the document (bounded by NumDocs), second is the term (bounded by NumTerms).
41  matrix[i, j] = (matrix[i, j] > 0.0) ? 1.0 : 0.0;
42  }
43  }
44  return matrix;
45  }
46 
// NOTE(review): the method signature that belongs directly above this body is
// missing from this listing, so the method name and return type are not documented here.
// For each document index i, the body divides every cell matrix[i, j] by the
// maximum of matrix.GetDocument(i), writing the result back in place, and
// finally returns the same `matrix` object.
54  {
55  for (int i = 0; i < matrix.NumDocs; i++)
56  {
// Presumably GetDocument(i) yields the term values of document i - TODO confirm against TermDocumentMatrix.
57  double max = matrix.GetDocument(i).Max();
58  for (int j = 0; j < matrix.NumTerms; j++)
59  {
// NOTE(review): if max is 0 this is a floating-point division by zero, which
// yields NaN or Infinity rather than throwing - confirm whether that can occur.
60  matrix[i, j] = matrix[i, j] / max;
61  }
62  }
63  return matrix;
64  }
65 
/// <summary>
/// Computes the document frequency of every term, i.e. the number of
/// documents whose entry for that term is strictly greater than zero.
/// </summary>
/// <param name="matrix">Matrix indexed as [document, term].</param>
/// <returns>
/// An array of length <c>matrix.NumTerms</c>; element j holds the number of
/// documents with a positive entry for term j.
/// </returns>
public static double[] ComputeDF(TermDocumentMatrix matrix)
{
    double[] counts = new double[matrix.NumTerms];
    for (int term = 0; term < matrix.NumTerms; term++)
    {
        // Count the documents that contain this term at all.
        double containing = 0.0;
        for (int doc = 0; doc < matrix.NumDocs; doc++)
        {
            if (matrix[doc, term] > 0.0)
            {
                containing += 1.0;
            }
        }
        counts[term] = containing;
    }
    return counts;
}
84 
/// <summary>
/// Computes the inverse document frequency of every term in the matrix by
/// first computing the document frequencies and then delegating to the
/// (double[], int) overload with the matrix's document count.
/// </summary>
/// <param name="matrix">Matrix indexed as [document, term].</param>
/// <returns>The array produced by the (double[], int) overload.</returns>
public static double[] ComputeIDF(TermDocumentMatrix matrix)
{
    double[] documentFrequencies = ComputeDF(matrix);
    int documentCount = matrix.NumDocs;
    return ComputeIDF(documentFrequencies, documentCount);
}
94 
/// <summary>
/// Converts document frequencies into inverse document frequencies using the
/// natural logarithm of <c>numDocs / df</c>.
/// </summary>
/// <param name="df">Document frequency of each term.</param>
/// <param name="numDocs">Total number of documents.</param>
/// <returns>
/// An array of the same length as <paramref name="df"/>. An entry is 0.0 when
/// the corresponding frequency is less than or equal to zero; otherwise it is
/// <c>Math.Log(numDocs / df[i])</c>.
/// </returns>
public static double[] ComputeIDF(double[] df, int numDocs)
{
    double[] weights = new double[df.Length];
    int slot = 0;
    foreach (double frequency in df)
    {
        // Terms that occur in no document get weight 0 instead of log(n / 0).
        // The comparison is kept as "<=" so a NaN frequency still reaches Math.Log.
        weights[slot] = (frequency <= 0.0) ? 0.0 : Math.Log(numDocs / frequency);
        slot++;
    }
    return weights;
}
117 
// NOTE(review): the method signature that belongs directly above this body is
// missing from this listing, so the method name and return type are not documented here.
// The body evaluates ComputeTF(matrix) first and ComputeIDF(matrix) second
// (C# evaluates arguments left to right) and passes both results to the
// two-argument ComputeTFIDF overload, returning whatever that overload returns.
// NOTE(review): no ComputeTF signature is visible in this listing - confirm
// whether it modifies `matrix` before ComputeIDF reads it.
124  {
125  return ComputeTFIDF(ComputeTF(matrix), ComputeIDF(matrix));
126  }
127 
/// <summary>
/// Multiplies every entry of <paramref name="tf"/> by the weight of its term
/// in <paramref name="idf"/>. The matrix is modified in place.
/// </summary>
/// <param name="tf">Matrix indexed as [document, term]; overwritten with the products.</param>
/// <param name="idf">Per-term weights, indexed by term.</param>
/// <returns>The same <paramref name="tf"/> instance that was passed in.</returns>
public static TermDocumentMatrix ComputeTFIDF(TermDocumentMatrix tf, double[] idf)
{
    for (int doc = 0; doc < tf.NumDocs; doc++)
    {
        for (int term = 0; term < tf.NumTerms; term++)
        {
            // Read the cell, scale it by the term's weight, write it back.
            double scaled = tf[doc, term] * idf[term];
            tf[doc, term] = scaled;
        }
    }
    return tf;
}
145 
/// <summary>
/// Computes the per-term average over all documents of the matrix.
/// </summary>
/// <param name="matrix">Matrix indexed as [document, term].</param>
/// <returns>
/// An array of length <c>matrix.NumTerms</c>; element j is the sum of term j
/// over every document divided by <c>matrix.NumDocs</c>. When the matrix has
/// no documents each element is 0.0 / 0, i.e. NaN.
/// </returns>
public static double[] ComputeAverageVector(TermDocumentMatrix matrix)
{
    double[] mean = new double[matrix.NumTerms];
    for (int term = 0; term < matrix.NumTerms; term++)
    {
        // Accumulate the column total directly in the output slot ...
        for (int doc = 0; doc < matrix.NumDocs; doc++)
        {
            mean[term] += matrix[doc, term];
        }
        // ... then turn the total into an average.
        mean[term] /= matrix.NumDocs;
    }
    return mean;
}
164 
/// <summary>
/// Computes the per-term average over the documents named by
/// <paramref name="IDs"/>.
/// </summary>
/// <param name="matrix">Matrix indexed as [document, term].</param>
/// <param name="IDs">
/// IDs of the documents to average. The sequence is enumerated exactly once;
/// an ID that appears several times is counted several times.
/// </param>
/// <returns>
/// An array of length <c>matrix.NumTerms</c>; element j is the sum of term j
/// over the selected documents divided by the number of IDs. When
/// <paramref name="IDs"/> is empty each element is 0.0 / 0, i.e. NaN.
/// </returns>
public static double[] ComputeAverageVector(TermDocumentMatrix matrix, IEnumerable<string> IDs)
{
    double[] avg = new double[matrix.NumTerms];

    // Resolve every ID to its document index once, up front. Previously the
    // sequence was re-enumerated (and each ID re-resolved) for every term,
    // and IDs.Count() walked it yet again per term; that is wasteful and
    // gives inconsistent results for lazily evaluated or one-shot sequences.
    List<int> docIndices = new List<int>();
    foreach (string docID in IDs)
    {
        docIndices.Add(matrix.GetDocumentIndex(docID));
    }

    for (int j = 0; j < matrix.NumTerms; j++)
    {
        // Same summation order as before: documents in the order given by IDs.
        foreach (int docIndex in docIndices)
        {
            avg[j] += matrix[docIndex, j];
        }
        avg[j] = avg[j] / docIndices.Count;
    }
    return avg;
}
185  }
186 }