TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
TermDocumentMatrix.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using System.IO;
20 using TraceLab.Components.DevelopmentKit.Properties;
21 using TraceLabSDK;
22 using TraceLabSDK.Types;
23 
24 namespace TraceLab.Components.DevelopmentKit
25 {
29  public class TermDocumentMatrix
30  {
31  #region Private members
// Cell values, addressed as _matrix[document][term].
private double[][] _matrix;
// Document names by row position, and the reverse name-to-row lookup.
private List<string> _docIndex;
private Dictionary<string, int> _docIndexLookup;
// Term names by column position, and the reverse name-to-column lookup.
private List<string> _termIndex;
private Dictionary<string, int> _termIndexLookup;
37  #endregion
38 
39  #region Public accessors
40 
/// <summary>
/// Gets or sets the matrix cell for a document row and a term column.
/// </summary>
/// <param name="docindex">Row position of the document.</param>
/// <param name="termindex">Column position of the term.</param>
public double this[int docindex, int termindex]
{
    get
    {
        double[] row = _matrix[docindex];
        return row[termindex];
    }
    set
    {
        double[] row = _matrix[docindex];
        row[termindex] = value;
    }
}
58 
/// <summary>
/// Gets the backing jagged array itself (not a copy), indexed as [document][term].
/// </summary>
public double[][] RawMatrix
{
    get { return _matrix; }
}
71 
/// <summary>
/// Gets the internal list of term names, ordered by column position.
/// </summary>
public List<string> TermMap
{
    get { return _termIndex; }
}
82 
/// <summary>
/// Gets the internal list of document names, ordered by row position.
/// </summary>
public List<string> DocMap
{
    get { return _docIndex; }
}
93 
/// <summary>
/// Gets the number of documents, taken from the length of the document name list.
/// </summary>
public int NumDocs
{
    get { return _docIndex.Count; }
}
104 
/// <summary>
/// Gets the number of terms, taken from the length of the term name list.
/// </summary>
public int NumTerms
{
    get { return _termIndex.Count; }
}
115 
116  #endregion
117 
118  #region Constructor
119 
/// <summary>
/// Builds a matrix of raw term counts from one or more artifact collections.
/// Each artifact becomes a row; each distinct non-blank token produced by
/// <c>artifact.Text.Split()</c> becomes a column, in order of first appearance.
/// </summary>
/// <param name="artifactsCollections">Collections whose artifacts supply the documents.</param>
/// <exception cref="ArgumentException">
/// Raised by the dictionary inserts when the same artifact id occurs more than once.
/// </exception>
public TermDocumentMatrix(params TLArtifactsCollection[] artifactsCollections)
{
    _termIndex = new List<string>();
    _docIndex = new List<string>();
    _termIndexLookup = new Dictionary<string, int>();
    _docIndexLookup = new Dictionary<string, int>();

    // first pass: per-document term counts, keyed by artifact id
    var countsByDoc = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLArtifactsCollection collection in artifactsCollections)
    {
        foreach (TLArtifact artifact in collection.Values)
        {
            string id = artifact.Id;
            _docIndex.Add(id);
            _docIndexLookup.Add(id, _docIndex.Count - 1);

            var counts = new Dictionary<string, double>();
            countsByDoc.Add(id, counts);

            foreach (string token in artifact.Text.Split())
            {
                if (String.IsNullOrWhiteSpace(token))
                {
                    continue;
                }
                // register a column the first time a term is seen anywhere
                if (!_termIndexLookup.ContainsKey(token))
                {
                    _termIndex.Add(token);
                    _termIndexLookup.Add(token, _termIndex.Count - 1);
                }
                // a missing entry reads as 0, so this both inserts and increments
                double seen;
                counts.TryGetValue(token, out seen);
                counts[token] = seen + 1;
            }
        }
    }

    // second pass: lay the counts out as dense rows
    int docTotal = _docIndex.Count;
    int termTotal = _termIndex.Count;
    _matrix = new double[docTotal][];
    for (int d = 0; d < docTotal; d++)
    {
        double[] row = new double[termTotal];
        _matrix[d] = row;
        Dictionary<string, double> counts = countsByDoc[_docIndex[d]];
        for (int t = 0; t < termTotal; t++)
        {
            // TryGetValue writes 0.0 for terms the document does not contain
            counts.TryGetValue(_termIndex[t], out row[t]);
        }
    }
}
176 
/// <summary>
/// Creates a zero-filled matrix of the requested size. Documents are named
/// "d0", "d1", ... and terms "t0", "t1", ... after their positions.
/// </summary>
/// <param name="docs">Number of document rows.</param>
/// <param name="terms">Number of term columns.</param>
public TermDocumentMatrix(int docs, int terms)
{
    _matrix = new double[docs][];
    _docIndex = new List<string>();
    _docIndexLookup = new Dictionary<string, int>();
    _termIndex = new List<string>();
    _termIndexLookup = new Dictionary<string, int>();

    for (int row = 0; row < docs; row++)
    {
        string docName = "d" + row;
        _matrix[row] = new double[terms];
        _docIndex.Add(docName);
        _docIndexLookup.Add(docName, row);
    }

    for (int col = 0; col < terms; col++)
    {
        string termName = "t" + col;
        _termIndex.Add(termName);
        _termIndexLookup.Add(termName, col);
    }
}
201 
/// <summary>
/// Creates an instance without initializing any field. The static helpers in
/// this class that call it assign the fields themselves.
/// </summary>
private TermDocumentMatrix()
{
}
206 
212  {
213  _matrix = new double[matrix.NumDocs][];
214  for (int i = 0; i < matrix.NumDocs; i++)
215  {
216  _matrix[i] = new double[matrix.NumTerms];
217  for (int j = 0; j < matrix.NumTerms; j++)
218  {
219  _matrix[i][j] = matrix[i,j];
220  }
221  }
222  _docIndex = new List<string>(matrix._docIndex);
223  _docIndexLookup = new Dictionary<string, int>(matrix._docIndexLookup);
224  _termIndex = new List<string>(matrix._termIndex);
225  _termIndexLookup = new Dictionary<string, int>(matrix._termIndexLookup);
226  }
227 
228  #endregion
229 
230  #region Getters
231 
/// <summary>
/// Returns the row stored at the given document position.
/// The internal array is returned, not a copy.
/// </summary>
/// <param name="index">Row position of the document.</param>
/// <returns>The term values of that document.</returns>
public double[] GetDocument(int index)
{
    double[] row = _matrix[index];
    return row;
}
241 
/// <summary>
/// Returns the row stored for the named document.
/// </summary>
/// <param name="artifactID">Document name to look up.</param>
/// <returns>The term values of that document.</returns>
/// <exception cref="KeyNotFoundException">The name is not in the document lookup.</exception>
public double[] GetDocument(string artifactID)
{
    int row = _docIndexLookup[artifactID];
    return GetDocument(row);
}
251 
/// <summary>
/// Reads a single cell by position.
/// </summary>
/// <param name="doc">Row position of the document.</param>
/// <param name="term">Column position of the term.</param>
/// <returns>The value stored in that cell.</returns>
public double GetValue(int doc, int term)
{
    double[] row = _matrix[doc];
    return row[term];
}
262 
/// <summary>
/// Reads a single cell by document name and term name.
/// </summary>
/// <param name="artifactID">Document name.</param>
/// <param name="term">Term name.</param>
/// <returns>The value stored in that cell.</returns>
/// <exception cref="KeyNotFoundException">Either name is missing from its lookup.</exception>
public double GetValue(string artifactID, string term)
{
    // the document is resolved first, then the term
    int row = _docIndexLookup[artifactID];
    int column = _termIndexLookup[term];
    return GetValue(row, column);
}
273 
/// <summary>
/// Resolves a term name to its column position.
/// </summary>
/// <param name="term">Term name.</param>
/// <returns>Column position recorded for the term.</returns>
/// <exception cref="KeyNotFoundException">The term is not in the lookup.</exception>
public int GetTermIndex(string term)
{
    int column = _termIndexLookup[term];
    return column;
}
283 
/// <summary>
/// Resolves a column position to its term name.
/// </summary>
/// <param name="index">Column position.</param>
/// <returns>The term name stored at that position.</returns>
public string GetTermName(int index)
{
    string name = _termIndex[index];
    return name;
}
293 
/// <summary>
/// Resolves a document name to its row position.
/// </summary>
/// <param name="artifactID">Document name.</param>
/// <returns>Row position recorded for the document.</returns>
/// <exception cref="KeyNotFoundException">The name is not in the lookup.</exception>
public int GetDocumentIndex(string artifactID)
{
    int row = _docIndexLookup[artifactID];
    return row;
}
303 
/// <summary>
/// Resolves a row position to its document name.
/// </summary>
/// <param name="index">Row position.</param>
/// <returns>The document name stored at that position.</returns>
public string GetDocumentName(int index)
{
    string name = _docIndex[index];
    return name;
}
313 
314  #endregion
315 
316  #region Setters
317 
/// <summary>
/// Replaces a document row with the supplied array. The array is stored
/// directly, not copied.
/// </summary>
/// <param name="index">Row position to replace.</param>
/// <param name="doc">New term values; must have the same length as the current row.</param>
/// <exception cref="ArgumentException">The lengths differ.</exception>
public void SetDocument(int index, double[] doc)
{
    int supplied = doc.Length;
    int expected = _matrix[index].Length;
    if (supplied != expected)
    {
        throw new ArgumentException("The array sizes do not match.");
    }
    _matrix[index] = doc;
}
329 
/// <summary>
/// Replaces the row of the named document with the supplied array.
/// </summary>
/// <param name="artifactID">Document name.</param>
/// <param name="doc">New term values; must match the current row length.</param>
/// <exception cref="KeyNotFoundException">The name is not in the lookup.</exception>
public void SetDocument(string artifactID, double[] doc)
{
    int row = _docIndexLookup[artifactID];
    SetDocument(row, doc);
}
339 
/// <summary>
/// Writes a single cell by position.
/// </summary>
/// <param name="doc">Row position of the document.</param>
/// <param name="term">Column position of the term.</param>
/// <param name="value">Value to store.</param>
public void SetValue(int doc, int term, double value)
{
    double[] row = _matrix[doc];
    row[term] = value;
}
350 
/// <summary>
/// Writes a single cell by document name and term name.
/// </summary>
/// <param name="artifactID">Document name.</param>
/// <param name="term">Term name.</param>
/// <param name="value">Value to store.</param>
/// <exception cref="KeyNotFoundException">Either name is missing from its lookup.</exception>
public void SetValue(string artifactID, string term, double value)
{
    // the document is resolved first, then the term
    int row = _docIndexLookup[artifactID];
    int column = _termIndexLookup[term];
    SetValue(row, column, value);
}
361 
/// <summary>
/// Replaces the whole backing array after checking it has the same shape as
/// the current one. The supplied array is stored directly, not copied.
/// </summary>
/// <param name="matrix">Replacement values, indexed as [document][term].</param>
/// <exception cref="ArgumentException">
/// The row count differs, or some row has a different length than the current row.
/// </exception>
public void SetMatrix(double[][] matrix)
{
    int suppliedRows = matrix.Length;
    int expectedRows = _matrix.Length;
    if (suppliedRows != expectedRows)
    {
        throw new ArgumentException("The matrix has the wrong number of rows.");
    }

    int row = 0;
    while (row < expectedRows)
    {
        if (matrix[row].Length != _matrix[row].Length)
        {
            throw new ArgumentException("The matrix has the wrong number of columns in row " + row + ".");
        }
        row++;
    }

    _matrix = matrix;
}
377 
378  #endregion
379 
380  #region Queries
381 
/// <summary>
/// Reports whether a document name is present in the document lookup.
/// </summary>
/// <param name="artifactID">Document name to test.</param>
/// <returns>True when the name is known; otherwise false.</returns>
public bool ContainsDocument(string artifactID)
{
    int ignoredRow;
    return _docIndexLookup.TryGetValue(artifactID, out ignoredRow);
}
391 
/// <summary>
/// Reports whether a term name is present in the term lookup.
/// </summary>
/// <param name="term">Term name to test.</param>
/// <returns>True when the term is known; otherwise false.</returns>
public bool ContainsTerm(string term)
{
    int ignoredColumn;
    return _termIndexLookup.TryGetValue(term, out ignoredColumn);
}
401 
/// <summary>
/// Converts the matrix back into artifacts. For each document, every term
/// whose value is positive and within <c>Settings.Default.DoubleTolerance</c>
/// of a whole number is repeated that many times, and the repetitions are
/// joined with single spaces to form the artifact text. Cells that fail the
/// test contribute nothing.
/// </summary>
/// <returns>A collection holding one artifact per document.</returns>
public TLArtifactsCollection ToTLArtifactsCollection()
{
    TLArtifactsCollection result = new TLArtifactsCollection();
    for (int d = 0; d < NumDocs; d++)
    {
        double[] row = _matrix[d];
        List<string> words = new List<string>();
        for (int t = 0; t < NumTerms; t++)
        {
            double weight = row[t];
            if (weight > 0.0)
            {
                double rounded = Math.Round(weight);
                if (Math.Abs(weight - rounded) < Settings.Default.DoubleTolerance)
                {
                    int copies = Convert.ToInt32(rounded);
                    string word = GetTermName(t);
                    for (int n = 0; n < copies; n++)
                    {
                        words.Add(word);
                    }
                }
            }
        }
        string text = String.Join(" ", words);
        result.Add(new TLArtifact(GetDocumentName(d), text));
    }
    return result;
}
428 
429  #endregion
430 
431  #region Static utilities
432 
/// <summary>
/// Produces copies of two matrices that share one term list. The shared list
/// holds the terms of <paramref name="matrix1"/> in their original order,
/// followed by the terms that occur only in <paramref name="matrix2"/>.
/// Each copy keeps its own documents; cells for terms a matrix did not have are 0.
/// </summary>
/// <param name="matrix1">First matrix; its column order is preserved.</param>
/// <param name="matrix2">Second matrix; its values are re-mapped by term name.</param>
/// <returns>A two-element list: the widened first matrix, then the widened second.</returns>
public static List<TermDocumentMatrix> Equalize(TermDocumentMatrix matrix1, TermDocumentMatrix matrix2)
{
    // shared term list: everything in matrix1, then whatever only matrix2 has
    List<string> unionTerms = new List<string>();
    Dictionary<string, int> unionLookup = new Dictionary<string, int>();
    Dictionary<string, int> onlyInSecond = new Dictionary<string, int>(matrix2._termIndexLookup);
    foreach (string term in matrix1._termIndex)
    {
        unionTerms.Add(term);
        unionLookup.Add(term, unionTerms.Count - 1);
        // removing a key that is absent is a no-op
        onlyInSecond.Remove(term);
    }
    foreach (string term in onlyInSecond.Keys)
    {
        unionTerms.Add(term);
        unionLookup.Add(term, unionTerms.Count - 1);
    }
    int width = unionTerms.Count;

    // first result: matrix1's columns keep their positions, new columns stay 0
    TermDocumentMatrix first = new TermDocumentMatrix();
    first._docIndex = new List<string>(matrix1._docIndex);
    first._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
    first._termIndex = new List<string>(unionTerms);
    first._termIndexLookup = new Dictionary<string, int>(unionLookup);
    first._matrix = new double[matrix1.NumDocs][];
    for (int d = 0; d < first.NumDocs; d++)
    {
        double[] row = new double[width];
        first._matrix[d] = row;
        for (int t = 0; t < matrix1.NumTerms; t++)
        {
            row[t] = matrix1[d, t];
        }
    }

    // second result: every shared column is looked up in matrix2 by name
    TermDocumentMatrix second = new TermDocumentMatrix();
    second._docIndex = new List<string>(matrix2._docIndex);
    second._docIndexLookup = new Dictionary<string, int>(matrix2._docIndexLookup);
    second._termIndex = new List<string>(unionTerms);
    second._termIndexLookup = new Dictionary<string, int>(unionLookup);
    second._matrix = new double[matrix2.NumDocs][];
    for (int d = 0; d < second.NumDocs; d++)
    {
        double[] row = new double[width];
        second._matrix[d] = row;
        string docName = matrix2.GetDocumentName(d);
        for (int t = 0; t < width; t++)
        {
            string term = unionTerms[t];
            if (matrix2.ContainsTerm(term))
            {
                row[t] = matrix2.GetValue(docName, term);
            }
        }
    }

    return new List<TermDocumentMatrix> { first, second };
}
516 
/// <summary>
/// Builds a two-row matrix holding one document from each input, expressed
/// over a shared term list: the terms of <paramref name="matrix1"/> in order,
/// then the terms that occur only in <paramref name="matrix2"/>.
/// </summary>
/// <param name="matrix1">Matrix supplying the first document.</param>
/// <param name="document1">Row position of the document in <paramref name="matrix1"/>.</param>
/// <param name="matrix2">Matrix supplying the second document.</param>
/// <param name="document2">Row position of the document in <paramref name="matrix2"/>.</param>
/// <returns>A matrix whose row 0 is the first document and row 1 the second.</returns>
/// <exception cref="ArgumentException">
/// Raised by the document lookup insert when both documents have the same name.
/// </exception>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, int document1, TermDocumentMatrix matrix2, int document2)
{
    string firstName = matrix1.GetDocumentName(document1);
    string secondName = matrix2.GetDocumentName(document2);

    TermDocumentMatrix pair = new TermDocumentMatrix();
    pair._matrix = new double[2][];
    pair._termIndex = new List<string>();
    pair._termIndexLookup = new Dictionary<string, int>();
    pair._docIndex = new List<string> { firstName, secondName };
    pair._docIndexLookup = new Dictionary<string, int>();
    pair._docIndexLookup.Add(firstName, 0);
    pair._docIndexLookup.Add(secondName, 1);

    List<double> firstRow = new List<double>();
    List<double> secondRow = new List<double>();

    // terms of matrix1 first; anything left in this copy afterwards is matrix2-only
    Dictionary<string, int> onlyInSecond = new Dictionary<string, int>(matrix2._termIndexLookup);
    foreach (string term in matrix1._termIndex)
    {
        pair._termIndex.Add(term);
        pair._termIndexLookup.Add(term, pair._termIndex.Count - 1);
        firstRow.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));

        int columnInSecond;
        if (matrix2._termIndexLookup.TryGetValue(term, out columnInSecond))
        {
            onlyInSecond.Remove(term);
            secondRow.Add(matrix2.GetValue(document2, columnInSecond));
        }
        else
        {
            secondRow.Add(0.0);
        }
    }

    // the copied lookup still carries matrix2's column position for each remaining term
    foreach (KeyValuePair<string, int> entry in onlyInSecond)
    {
        pair._termIndex.Add(entry.Key);
        pair._termIndexLookup.Add(entry.Key, pair._termIndex.Count - 1);
        firstRow.Add(0.0);
        secondRow.Add(matrix2.GetValue(document2, entry.Value));
    }

    pair._matrix[0] = firstRow.ToArray();
    pair._matrix[1] = secondRow.ToArray();
    return pair;
}
570 
/// <summary>
/// Name-based overload: resolves each document name to its row position and
/// delegates to the position-based <c>EqualizeDocuments</c>.
/// </summary>
/// <param name="matrix1">Matrix supplying the first document.</param>
/// <param name="artifact1">Name of the document in <paramref name="matrix1"/>.</param>
/// <param name="matrix2">Matrix supplying the second document.</param>
/// <param name="artifact2">Name of the document in <paramref name="matrix2"/>.</param>
/// <returns>The two-row matrix produced by the position-based overload.</returns>
public static TermDocumentMatrix EqualizeDocuments(TermDocumentMatrix matrix1, string artifact1, TermDocumentMatrix matrix2, string artifact2)
{
    int firstRow = matrix1.GetDocumentIndex(artifact1);
    int secondRow = matrix2.GetDocumentIndex(artifact2);
    return EqualizeDocuments(matrix1, firstRow, matrix2, secondRow);
}
583 
592  {
593  TermDocumentMatrix combined = new TermDocumentMatrix();
594  // add documents
595  combined._docIndex = new List<string>(matrix1._docIndex);
596  combined._docIndexLookup = new Dictionary<string, int>(matrix1._docIndexLookup);
597  foreach (string doc in matrix2.DocMap)
598  {
599  combined._docIndex.Add(doc);
600  combined._docIndexLookup.Add(doc, combined.NumDocs - 1);
601  }
602  // calculate union of terms
603  combined._termIndex = new List<string>(matrix1._termIndex);
604  combined._termIndexLookup = new Dictionary<string, int>(matrix1._termIndexLookup);
605  foreach (string term in matrix2.TermMap)
606  {
607  if (!combined._termIndexLookup.ContainsKey(term))
608  {
609  combined._termIndex.Add(term);
610  combined._termIndexLookup.Add(term, combined.NumTerms - 1);
611  }
612  }
613  // create and populate matrix
614  combined._matrix = new double[combined.NumDocs][];
615  // matrix1
616  for (int i = 0; i < matrix1.NumDocs; i++)
617  {
618  combined._matrix[i] = new double[combined.NumTerms];
619  for (int j = 0; j < combined.NumTerms; j++)
620  {
621  if (matrix1.ContainsTerm(combined.TermMap[j]))
622  {
623  combined[i, j] = matrix1[i, matrix1.GetTermIndex(combined.TermMap[j])];
624  }
625  else
626  {
627  combined[i, j] = 0.0;
628  }
629  }
630  }
631  // matrix2
632  for (int i = matrix1.NumDocs; i < combined.NumDocs; i++)
633  {
634  combined._matrix[i] = new double[combined.NumTerms];
635  for (int j = 0; j < combined.NumTerms; j++)
636  {
637  if (matrix2.ContainsTerm(combined.TermMap[j]))
638  {
639  combined[i, j] = matrix2[i - matrix1.NumDocs, matrix2.GetTermIndex(combined.TermMap[j])];
640  }
641  else
642  {
643  combined[i, j] = 0.0;
644  }
645  }
646  }
647  return combined;
648  }
649 
650  #endregion
651 
652  #region Static I/O
653 
// Separator written between fields by Save and accepted (together with
// space and tab) by Load. Never reassigned, so it is marked readonly.
private static readonly string IODelimeter = " ";
658 
/// <summary>
/// Writes a matrix to a text file, creating or overwriting it. The first line
/// lists every term, each preceded by the delimiter. Every following line
/// holds a document name followed by its delimiter-separated values.
/// </summary>
/// <param name="matrix">Matrix to write.</param>
/// <param name="filename">Path of the file to create.</param>
/// <exception cref="ArgumentNullException"><paramref name="matrix"/> is null.</exception>
public static void Save(TermDocumentMatrix matrix, string filename)
{
    // fail before touching the disk so a bad call cannot truncate an existing file
    if (matrix == null)
    {
        throw new ArgumentNullException("matrix");
    }

    // the using block flushes and closes the file even when a write throws
    using (TextWriter tw = new StreamWriter(File.Open(filename, FileMode.Create)))
    {
        // header line: the term list
        foreach (string term in matrix.TermMap)
        {
            tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, term);
        }
        tw.WriteLine();

        // one line per document
        // NOTE(review): values are formatted with the writer's default culture,
        // which looks like it must match the culture used when loading - confirm.
        for (int i = 0; i < matrix.NumDocs; i++)
        {
            tw.Write(matrix.GetDocumentName(i));
            for (int j = 0; j < matrix.NumTerms; j++)
            {
                tw.Write("{0}{1}", TermDocumentMatrix.IODelimeter, matrix[i, j]);
            }
            tw.WriteLine();
        }
    }
}
689 
/// <summary>
/// Reads a matrix from a text file. The first line is the term list; every
/// later line is a document name followed by one value per term. Fields may
/// be separated by the class delimiter, spaces or tabs. A term name that
/// repeats in the header is made unique by appending random digits.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>The matrix described by the file.</returns>
/// <exception cref="InvalidDataException">
/// The file has no header line, or a document line does not contain exactly
/// one name plus one value per term.
/// </exception>
public static TermDocumentMatrix Load(string filename)
{
    string[] delimeter = new string[] { TermDocumentMatrix.IODelimeter, " ", "\t" };
    TermDocumentMatrix matrix = new TermDocumentMatrix();
    List<double[]> docs = new List<double[]>();

    // the using block closes the file on every exit path, including parse errors
    using (TextReader tr = new StreamReader(File.OpenRead(filename)))
    {
        int lineNum = 1;
        string line = tr.ReadLine();
        if (line == null)
        {
            // ReadLine returns null only when the file has no content at all
            throw new InvalidDataException("Missing term list on line " + lineNum + " in file: " + filename);
        }

        // read terms
        List<string> termList = new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));

        // rename repeated terms until every name is unique
        HashSet<string> termSet = new HashSet<string>();
        Random suffixSource = new Random();
        for (int i = 0; i < termList.Count; i++)
        {
            string candidate = termList[i];
            while (!termSet.Add(candidate))
            {
                candidate = candidate + suffixSource.Next();
            }
            termList[i] = candidate;
        }

        matrix._termIndex = termList;
        matrix._termIndexLookup = new Dictionary<string, int>();
        for (int i = 0; i < termList.Count; i++)
        {
            matrix._termIndexLookup.Add(termList[i], i);
        }

        // read documents
        matrix._docIndex = new List<string>();
        matrix._docIndexLookup = new Dictionary<string, int>();
        while ((line = tr.ReadLine()) != null)
        {
            lineNum++;
            string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
            if (document.Length != matrix.NumTerms + 1)
            {
                throw new InvalidDataException("Incorrect data format on line " + lineNum + " in file: " + filename);
            }
            matrix._docIndex.Add(document[0]);
            matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);

            double[] doc = new double[matrix.NumTerms];
            for (int i = 1; i < document.Length; i++)
            {
                doc[i - 1] = Convert.ToDouble(document[i]);
            }
            docs.Add(doc);
        }
    }

    // each parsed row already has NumTerms entries, so the rows are used as-is
    matrix._matrix = docs.ToArray();
    return matrix;
}
763 
/// <summary>
/// Loads a matrix file and returns its transpose: the terms of the file
/// become the documents of the result, the documents become its terms, and
/// cell [i][j] of the result holds cell [j][i] of the loaded matrix.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>The transposed matrix.</returns>
public static TermDocumentMatrix LoadTransposed(string filename)
{
    TermDocumentMatrix original = Load(filename);
    // result instance; the fields are filled in below
    TermDocumentMatrix t = new TermDocumentMatrix();

    // original terms become the rows (documents) of the result
    t._matrix = new double[original.NumTerms][];
    t._docIndex = new List<string>();
    t._docIndexLookup = new Dictionary<string, int>();
    for (int i = 0; i < original.NumTerms; i++)
    {
        t._matrix[i] = new double[original.NumDocs];
        t._docIndex.Add(original._termIndex[i]);
        t._docIndexLookup.Add(original._termIndex[i], i);
    }

    // original documents become the columns (terms) of the result
    t._termIndex = new List<string>();
    t._termIndexLookup = new Dictionary<string, int>();
    for (int i = 0; i < original.NumDocs; i++)
    {
        t._termIndex.Add(original._docIndex[i]);
        t._termIndexLookup.Add(original._docIndex[i], i);
    }

    // copy the values across the diagonal
    for (int i = 0; i < original.NumTerms; i++)
    {
        for (int j = 0; j < original.NumDocs; j++)
        {
            t._matrix[i][j] = original._matrix[j][i];
        }
    }
    return t;
}
799 
800  #endregion
801  }
802 }