18 using System.Collections.Generic;
20 using TraceLab.Components.DevelopmentKit.Properties;
22 using TraceLabSDK.Types;
24 namespace TraceLab.Components.DevelopmentKit
31 #region Private members
// Cell storage as a jagged array. The code in this file addresses it as
// _matrix[document position][term position].
32 private double[][] _matrix;
// Term position -> term name.
33 private List<string> _termIndex;
// Document position -> document (artifact) id.
34 private List<string> _docIndex;
// Reverse maps: name -> position in the matching list above. They are filled
// alongside the lists everywhere in this file, so list and dictionary are
// expected to stay in sync.
35 private Dictionary<string, int> _termIndexLookup;
36 private Dictionary<string, int> _docIndexLookup;
39 #region Public accessors
// Indexer over the matrix cells: first argument is the document position,
// second is the term position. The visible lines perform no range checks of
// their own, so an invalid position is reported by the array access itself.
47 public double this[
int docindex,
int termindex]
// Read path.
51 return _matrix[docindex][termindex];
// Write path (`value` is the implicit setter argument).
55 _matrix[docindex][termindex] = value;
// Public property headers for the jagged array and the two name lists.
// Their accessor bodies are not part of this excerpt, so whether they expose
// the live objects or copies cannot be stated from here - TODO confirm.
64 public double[][] RawMatrix
75 public List<string> TermMap
86 public List<string> DocMap
// Number of documents = length of the document index list
// (presumably the body of NumDocs - header not visible).
101 return _docIndex.Count;
// Number of terms = length of the term index list
// (presumably the body of NumTerms - header not visible).
112 return _termIndex.Count;
// Builds the matrix from the artifacts in `artifactsCollections` (the
// enclosing header is not visible; presumably a constructor).
// Pass 1 tokenises every artifact and counts term occurrences per artifact.
// Pass 2 copies those counts into a dense documents x terms jagged array.
126 _termIndex =
new List<string>();
127 _docIndex =
new List<string>();
128 _termIndexLookup =
new Dictionary<string, int>();
129 _docIndexLookup =
new Dictionary<string, int>();
// corpus: artifact id -> (term -> occurrence count in that artifact).
132 Dictionary<string, Dictionary<string, double>> corpus =
new Dictionary<string, Dictionary<string, double>>();
133 foreach (TLArtifactsCollection artifacts
in artifactsCollections)
135 foreach (TLArtifact artifact
in artifacts.Values)
// The artifact is appended first so that Count - 1 is its row position.
138 _docIndex.Add(artifact.Id);
// NOTE(review): Dictionary.Add throws ArgumentException on a duplicate key,
// so an artifact id that occurs in more than one input collection fails here
// (and again at corpus.Add) - confirm ids are unique across collections.
139 _docIndexLookup.Add(artifact.Id, _docIndex.Count - 1);
140 corpus.Add(artifact.Id,
new Dictionary<string, double>());
// Split() without arguments splits on white-space characters; the
// IsNullOrWhiteSpace test below drops the empty tokens this can produce.
141 foreach (
string term
in artifact.Text.Split())
143 if (!String.IsNullOrWhiteSpace(term))
// First sighting of this term in any artifact: give it the next column.
146 if (!_termIndexLookup.ContainsKey(term))
148 _termIndex.Add(term);
149 _termIndexLookup.Add(term, _termIndex.Count - 1);
// Bump the count if the artifact already has the term, otherwise start at 1
// (the `else` line between these statements is not visible in this excerpt).
152 if (corpus[artifact.Id].ContainsKey(term))
154 corpus[artifact.Id][term]++;
158 corpus[artifact.Id].Add(term, 1);
// Pass 2: one row per document, one column per term.
166 _matrix =
new double[_docIndex.Count][];
167 for (
int i = 0; i < _docIndex.Count; i++)
169 _matrix[i] =
new double[_termIndex.Count];
170 for (
int j = 0; j < _termIndex.Count; j++)
// TryGetValue writes the count straight into the cell; when the term is
// absent from this document it writes default(double), i.e. 0.0.
172 corpus[_docIndex[i]].TryGetValue(_termIndex[j], out _matrix[i][j]);
// Creates a matrix with `docs` rows and `terms` columns and generated names
// (enclosing header not visible; presumably a constructor taking the two
// sizes). Cells are not assigned here, so they keep the array default of 0.0.
184 _matrix =
new double[docs][];
185 _docIndex =
new List<string>();
186 _docIndexLookup =
new Dictionary<string, int>();
// Documents are named "d0", "d1", ... in row order.
187 for (
int i = 0; i < docs; i++)
189 _matrix[i] =
new double[terms];
190 _docIndex.Add(
"d" + i);
191 _docIndexLookup.Add(
"d" + i, i);
193 _termIndex =
new List<string>();
194 _termIndexLookup =
new Dictionary<string, int>();
// Terms are named "t0", "t1", ... in column order.
195 for (
int i = 0; i < terms; i++)
197 _termIndex.Add(
"t" + i);
198 _termIndexLookup.Add(
"t" + i, i);
// Copies another matrix (enclosing header not visible; presumably a copy
// constructor taking `matrix`). Rows are freshly allocated and filled cell by
// cell, and the index lists/dictionaries are cloned, so this instance does not
// share its arrays or collections with the source.
213 _matrix =
new double[matrix.
NumDocs][];
214 for (
int i = 0; i < matrix.
NumDocs; i++)
216 _matrix[i] =
new double[matrix.
NumTerms];
217 for (
int j = 0; j < matrix.
NumTerms; j++)
// Read through the source's indexer rather than its private array.
219 _matrix[i][j] = matrix[i,j];
// The collection copy-constructors duplicate the entries.
222 _docIndex =
new List<string>(matrix._docIndex);
223 _docIndexLookup =
new Dictionary<string, int>(matrix._docIndexLookup);
224 _termIndex =
new List<string>(matrix._termIndex);
225 _termIndexLookup =
new Dictionary<string, int>(matrix._termIndexLookup);
// Read accessors. Only GetValue(string, string) has its header in this
// excerpt; the other lines are single-statement bodies whose headers are hidden.
// All name-based variants index a Dictionary directly, so an unknown name
// raises KeyNotFoundException rather than returning a sentinel.

// Row at position `index`. The stored array itself is returned, not a copy.
239 return _matrix[index];
// Row by artifact id: resolve the id to a position, then delegate.
249 return GetDocument(_docIndexLookup[artifactID]);
// Cell by (document position, term position).
260 return _matrix[doc][term];
// Cell by (artifact id, term name): both names are resolved through the
// lookup dictionaries and the call is forwarded to the positional overload.
269 public double GetValue(
string artifactID,
string term)
271 return GetValue(_docIndexLookup[artifactID], _termIndexLookup[term]);
// Term name -> term position.
281 return _termIndexLookup[term];
// Term position -> term name.
291 return _termIndex[index];
// Artifact id -> document position.
301 return _docIndexLookup[artifactID];
// Document position -> artifact id.
311 return _docIndex[index];
// Replaces the row at position `index` with `doc` (header not visible;
// presumably SetDocument(int, double[])). The new row must have the same
// length as the row it replaces, otherwise ArgumentException is thrown.
// NOTE(review): `doc` is dereferenced without a null check in the visible
// lines, and the caller's array is stored by reference (no copy), so later
// changes to that array are visible through the matrix.
325 if (doc.Length != _matrix[index].Length)
326 throw new ArgumentException(
"The array sizes do not match.");
327 _matrix[index] = doc;
// Overload by artifact id: resolve the id to a position and delegate.
337 SetDocument(_docIndexLookup[artifactID], doc);
// Writes `value` into the cell at (document position, term position).
346 public void SetValue(
int doc,
int term,
double value)
348 _matrix[doc][term] = value;
// Writes `value` into the cell addressed by artifact id and term name.
// Both names are resolved through the lookup dictionaries (an unknown name
// raises KeyNotFoundException) and the call is forwarded to the positional
// overload above.
357 public void SetValue(
string artifactID,
string term,
double value)
359 SetValue(_docIndexLookup[artifactID], _termIndexLookup[term], value);
// Shape validation for a replacement jagged array `matrix` (header and the
// final assignment are not visible - presumably this is a whole-matrix setter).
// GetLength(0) on a jagged array is its number of rows.
368 if (matrix.GetLength(0) != _matrix.GetLength(0))
369 throw new ArgumentException(
"The matrix has the wrong number of rows.");
// Every incoming row must be as long as the row it would replace.
370 for (
int i = 0; i < _matrix.GetLength(0); i++)
372 if (matrix[i].Length != _matrix[i].Length)
373 throw new ArgumentException(
"The matrix has the wrong number of columns in row " + i +
".");
// True when a document with this artifact id is present in the matrix.
389 return _docIndexLookup.ContainsKey(artifactID);
// True when this term is present in the matrix.
399 return _termIndexLookup.ContainsKey(term);
// Converts the matrix back into a TLArtifactsCollection (header not visible).
// For each document, every term is emitted as many times as its cell value,
// and the emitted terms are joined with single spaces to form the artifact
// text. Terms appear in column order, not in any original text order.
409 TLArtifactsCollection artifacts =
new TLArtifactsCollection();
410 for (
int i = 0; i < NumDocs; i++)
412 List<string> text =
new List<string>();
413 for (
int j = 0; j < NumTerms; j++)
// Only strictly positive cells that lie within Settings.Default.DoubleTolerance
// of a whole number are treated as counts; all other cells contribute no text.
415 if (_matrix[i][j] > 0.0 && Math.Abs(_matrix[i][j] - Math.Round(_matrix[i][j])) < Settings.Default.DoubleTolerance)
417 int kmax = Convert.ToInt32(Math.Round(_matrix[i][j]));
418 for (
int k = 0; k < kmax; k++)
420 text.Add(GetTermName(j));
424 artifacts.Add(
new TLArtifact(GetDocumentName(i), String.Join(
" ", text)));
431 #region Static utilities
// Produces two new matrices, matrices[0] from matrix1 and matrices[1] from
// matrix2, that share one term index: matrix1's terms in their original order
// followed by the terms found only in matrix2. (Method header not visible; the
// statements that add the two new instances to the list are also hidden.)
444 List<TermDocumentMatrix> matrices =
new List<TermDocumentMatrix>();
// Rows and document names are carried over unchanged from each input.
447 matrices[0]._matrix =
new double[matrix1.
NumDocs][];
448 matrices[0]._docIndex =
new List<string>(matrix1._docIndex);
449 matrices[0]._docIndexLookup =
new Dictionary<string,int>(matrix1._docIndexLookup);
452 matrices[1]._matrix =
new double[matrix2.
NumDocs][];
453 matrices[1]._docIndex =
new List<string>(matrix2._docIndex);
454 matrices[1]._docIndexLookup =
new Dictionary<string,int>(matrix2._docIndexLookup);
456 List<string> termIndex =
new List<string>();
457 Dictionary<string, int> termIndexLookup =
new Dictionary<string, int>();
// leftovers starts as all of matrix2's terms; the ones shared with matrix1
// are removed below, leaving the terms unique to matrix2.
458 Dictionary<string, int> leftovers =
new Dictionary<string,int>(matrix2._termIndexLookup);
460 foreach (
string term
in matrix1._termIndex)
// `Count - 1` implies the term was appended to termIndex on the hidden line
// just before this one - TODO confirm against the full file.
463 termIndexLookup.Add(term, termIndex.Count - 1);
465 if (matrix2._termIndexLookup.ContainsKey(term))
467 leftovers.Remove(term);
471 foreach (
string term
in leftovers.Keys)
// Same assumption as above about a hidden termIndex.Add(term).
474 termIndexLookup.Add(term, termIndex.Count - 1);
// Each output gets its own copy of the shared term index.
478 matrices[0]._termIndex =
new List<string>(termIndex);
479 matrices[0]._termIndexLookup =
new Dictionary<string,int>(termIndexLookup);
480 for (
int i = 0; i < matrices[0].NumDocs; i++)
482 matrices[0]._matrix[i] =
new double[termIndex.Count];
// The first matrix1.NumTerms columns keep matrix1's positions, so values can
// be copied positionally.
484 for (
int j = 0; j < matrix1.NumTerms; j++)
486 matrices[0][i, j] = matrix1[i, j];
// Remaining columns are terms matrix1 never had.
489 for (
int j = matrix1.NumTerms; j < termIndex.Count; j++)
491 matrices[0][i, j] = 0.0;
495 matrices[1]._termIndex =
new List<string>(termIndex);
496 matrices[1]._termIndexLookup =
new Dictionary<string,int>(termIndexLookup);
497 for (
int i = 0; i < matrices[1].NumDocs; i++)
499 matrices[1]._matrix[i] =
new double[termIndex.Count];
// matrix2's columns are in a different order from the shared index, so each
// value is fetched by term name; terms matrix2 lacks are set to 0.0.
501 for (
int j = 0; j < termIndex.Count; j++)
503 if (matrix2.ContainsTerm(termIndex[j]))
505 matrices[1][i, j] = matrix2.GetValue(matrix2.GetDocumentName(i), termIndex[j]);
509 matrices[1][i, j] = 0.0;
// Builds a two-row matrix from one document of matrix1 (`document1`) and one
// document of matrix2 (`document2`) over the union of both term sets.
// (Header not visible. document1/document2 are passed to GetDocumentName and
// to GetValue alongside an int term position, so they appear to be positions.)
531 newmatrix._matrix =
new double[2][];
532 newmatrix._termIndex =
new List<string>();
533 newmatrix._termIndexLookup =
new Dictionary<string, int>();
534 newmatrix._docIndex =
new List<string>();
535 newmatrix._docIndexLookup =
new Dictionary<string, int>();
// `Count - 1` implies each name is appended to _docIndex on a hidden line just
// before its lookup entry is added - TODO confirm.
537 newmatrix._docIndexLookup.Add(matrix1.
GetDocumentName(document1), newmatrix._docIndex.Count - 1);
// NOTE(review): if both documents carry the same name this second Add throws
// ArgumentException (duplicate key) - confirm callers never pass same-named
// documents.
539 newmatrix._docIndexLookup.Add(matrix2.
GetDocumentName(document2), newmatrix._docIndex.Count - 1);
// Row values are accumulated in lists and converted to arrays at the end.
540 List<double> doc1 =
new List<double>();
541 List<double> doc2 =
new List<double>();
// leftovers ends up holding the terms that only matrix2 has.
543 Dictionary<string, int> leftovers =
new Dictionary<string,int>(matrix2._termIndexLookup);
544 foreach (
string term
in matrix1._termIndex)
546 newmatrix._termIndex.Add(term);
547 newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
548 doc1.Add(matrix1.GetValue(document1, matrix1.GetTermIndex(term)));
// Shared term: take matrix2's value. The branch for a term missing from
// matrix2 is not visible; for the two rows to stay the same length it
// presumably appends a filler value to doc2 - TODO confirm.
549 if (matrix2._termIndexLookup.ContainsKey(term))
551 leftovers.Remove(term);
552 doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
// Terms unique to matrix2 are appended after all of matrix1's terms.
// The matching doc1 append is on a hidden line (presumably a filler value).
559 foreach (
string term
in leftovers.Keys)
561 newmatrix._termIndex.Add(term);
562 newmatrix._termIndexLookup.Add(term, newmatrix._termIndex.Count - 1);
564 doc2.Add(matrix2.GetValue(document2, matrix2.GetTermIndex(term)));
566 newmatrix._matrix[0] = doc1.ToArray();
567 newmatrix._matrix[1] = doc2.ToArray();
// Stacks matrix2's documents beneath matrix1's documents over the union of
// both term sets (header and creation of `combined` are not visible).
595 combined._docIndex =
new List<string>(matrix1._docIndex);
596 combined._docIndexLookup =
new Dictionary<string, int>(matrix1._docIndexLookup);
597 foreach (
string doc
in matrix2.DocMap)
599 combined._docIndex.Add(doc);
// NOTE(review): Dictionary.Add throws ArgumentException on a duplicate key, so
// a document id present in both inputs fails here - confirm the inputs are
// disjoint or that callers expect the exception.
600 combined._docIndexLookup.Add(doc, combined.NumDocs - 1);
// Term index: matrix1's terms first, then any matrix2 term not already known.
603 combined._termIndex =
new List<string>(matrix1._termIndex);
604 combined._termIndexLookup =
new Dictionary<string, int>(matrix1._termIndexLookup);
605 foreach (
string term
in matrix2.TermMap)
607 if (!combined._termIndexLookup.ContainsKey(term))
609 combined._termIndex.Add(term);
610 combined._termIndexLookup.Add(term, combined.NumTerms - 1);
614 combined._matrix =
new double[combined.NumDocs][];
// Rows 0 .. matrix1.NumDocs-1 come from matrix1; values are fetched by term
// name, and terms matrix1 does not have are set to 0.0.
616 for (
int i = 0; i < matrix1.NumDocs; i++)
618 combined._matrix[i] =
new double[combined.NumTerms];
619 for (
int j = 0; j < combined.NumTerms; j++)
621 if (matrix1.ContainsTerm(combined.TermMap[j]))
623 combined[i, j] = matrix1[i, matrix1.GetTermIndex(combined.TermMap[j])];
627 combined[i, j] = 0.0;
// Remaining rows come from matrix2; `i - matrix1.NumDocs` converts the
// combined row position back to matrix2's own row position.
632 for (
int i = matrix1.NumDocs; i < combined.NumDocs; i++)
634 combined._matrix[i] =
new double[combined.NumTerms];
635 for (
int j = 0; j < combined.NumTerms; j++)
637 if (matrix2.ContainsTerm(combined.TermMap[j]))
639 combined[i, j] = matrix2[i - matrix1.NumDocs, matrix2.GetTermIndex(combined.TermMap[j])];
643 combined[i, j] = 0.0;
// Separator string for the text file format (presumably used by the save/load
// code that follows; its uses are not visible in this excerpt).
// The identifier's spelling ("Delimeter") is left as is because renaming it
// would be a code change.
657 private static string IODelimeter =
" ";
// Writes `matrix` to `filename` as text (header not visible). FileMode.Create
// creates the file or truncates an existing one.
// NOTE(review): the writer is not opened in a `using` statement in the visible
// lines; if a write throws, the file handle stays open until finalisation.
// Confirm the hidden lines flush and close it, ideally via using/try-finally.
667 TextWriter tw =
new StreamWriter(File.Open(filename, FileMode.Create));
// Iterates over the term names (loop body not visible; presumably a header row).
669 foreach (
string term
in matrix.TermMap)
// One section per document, beginning with the document name.
675 for (
int i = 0; i < matrix.NumDocs; i++)
677 tw.Write(matrix.GetDocumentName(i));
// Iterates over the document's cells (loop body not visible).
679 for (
int j = 0; j < matrix.NumTerms; j++)
// Reads a matrix from `filename` (header not visible). The first line supplies
// the term names; each following line is a document name followed by one value
// per term.
// NOTE(review): the reader is not opened in a `using` statement in the visible
// lines, so an exception below (for example the InvalidDataException) would
// leave the file open - confirm the hidden lines dispose it.
697 TextReader tr =
new StreamReader(File.OpenRead(filename));
// NOTE(review): ReadLine returns null for an empty file; no null check is
// visible before line.Split below - confirm one exists on the hidden lines.
700 string line = tr.ReadLine();
704 List<string> termList =
new List<string>(line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries));
// Make term names unique: a repeated name gets a random integer appended.
706 HashSet<string> termSet =
new HashSet<string>();
707 for (
int i = 0; i < termList.Count; i++)
709 if (termSet.Contains(termList[i]))
// NOTE(review): a new Random is constructed for every duplicate. On .NET
// Framework the default seed is time-based, so instances created in quick
// succession can return the same number; a name occurring three or more times
// could then still collide and make _termIndexLookup.Add throw below. The
// renamed value also does not appear to be added to termSet (the Add below
// looks like the non-duplicate branch) - TODO confirm and consider a single
// shared Random or a counter suffix.
711 termList[i] = termList[i] +
new Random().Next();
716 termSet.Add(termList[i]);
// The list is adopted as the term index and the reverse map is rebuilt from it.
721 matrix._termIndex = termList;
722 matrix._termIndexLookup =
new Dictionary<string, int>();
723 for (
int i = 0; i < matrix._termIndex.Count; i++)
725 matrix._termIndexLookup.Add(matrix._termIndex[i], i);
728 matrix._docIndex =
new List<string>();
729 matrix._docIndexLookup =
new Dictionary<string, int>();
// Parsed rows are collected here before the final array is allocated
// (the docs.Add call is on a hidden line).
730 List<double[]> docs =
new List<double[]>();
731 while ((line = tr.ReadLine()) != null)
734 string[] document = line.Split(delimeter, StringSplitOptions.RemoveEmptyEntries);
// Each row must hold exactly one name plus NumTerms values.
735 if (document.Length != matrix.NumTerms + 1)
738 throw new InvalidDataException(
"Incorrect data format on line " + lineNum +
" in file: " + filename);
740 matrix._docIndex.Add(document[0]);
741 matrix._docIndexLookup.Add(document[0], matrix._docIndex.Count - 1);
742 double[] doc =
new double[matrix.NumTerms];
743 for (
int i = 1; i < document.Length; i++)
// NOTE(review): Convert.ToDouble(string) parses with the current culture, so
// a file written under one culture may not load under another - confirm the
// writer and reader agree, or pass an explicit culture.
745 doc[i - 1] = Convert.ToDouble(document[i]);
// Copy the collected rows into the matrix cell by cell.
750 matrix._matrix =
new double[matrix.NumDocs][];
751 for (
int i = 0; i < matrix.NumDocs; i++)
753 matrix._matrix[i] =
new double[matrix.NumTerms];
754 for (
int j = 0; j < matrix.NumTerms; j++)
756 matrix[i, j] = docs[i][j];
// Builds the transpose `t` of `original` (header and creation of `t` are not
// visible): terms become the rows/documents of `t`, and documents become its
// columns/terms.
774 t._matrix =
new double[original.
NumTerms][];
// The original term names become the document names of the result.
775 t._docIndex =
new List<string>();
776 t._docIndexLookup =
new Dictionary<string, int>();
777 for (
int i = 0; i < original.
NumTerms; i++)
779 t._matrix[i] =
new double[original.
NumDocs];
780 t._docIndex.Add(original._termIndex[i]);
781 t._docIndexLookup.Add(original._termIndex[i], i);
// The original document names become the term names of the result.
783 t._termIndex =
new List<string>();
784 t._termIndexLookup =
new Dictionary<string, int>();
785 for (
int i = 0; i < original.
NumDocs; i++)
787 t._termIndex.Add(original._docIndex[i]);
788 t._termIndexLookup.Add(original._docIndex[i], i);
// Swap the indices: cell (term i, document j) of the result is cell
// (document j, term i) of the original.
790 for (
int i = 0; i < original.
NumTerms; i++)
792 for (
int j = 0; j < original.
NumDocs; j++)
794 t._matrix[i][j] = original._matrix[j][i];