TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
LDACorpus.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using System.IO;
20 using System.Linq;
21 using System.Text;
22 using TraceLabSDK.Types;
23 using TraceLab.Components.DevelopmentKit;
24 using RPlugin.Core;
25 
26 namespace TraceLab.Components.DevelopmentKit.Tracers.InformationRetrieval
27 {
31  public class LDACorpus
32  {
33  #region Private members
34 
35  private TermDocumentMatrix _matrix;
36  private IEnumerable<string> _sourceDocs;
37  private IEnumerable<string> _targetDocs;
38 
39  #endregion
40 
41  #region Constructor
42 
/// <summary>
/// Creates a corpus from two artifact collections. Each collection is first
/// converted into its own TermDocumentMatrix (source, then target); the
/// matrix-based constructor then records the two document maps and combines
/// the matrices.
/// </summary>
/// <param name="name">Name assigned to the corpus</param>
/// <param name="source">Artifacts used to build the source matrix</param>
/// <param name="target">Artifacts used to build the target matrix</param>
public LDACorpus(string name, TLArtifactsCollection source, TLArtifactsCollection target)
    : this(name, new TermDocumentMatrix(source), new TermDocumentMatrix(target))
{
}
58 
/// <summary>
/// Creates a corpus from two existing term-document matrices. The DocMap of
/// each matrix is remembered as the source / target document set, and the
/// corpus matrix is the result of TermDocumentMatrix.Combine(source, target).
/// </summary>
/// <param name="name">Name assigned to the corpus</param>
/// <param name="source">Matrix supplying the source documents</param>
/// <param name="target">Matrix supplying the target documents</param>
public LDACorpus(string name, TermDocumentMatrix source, TermDocumentMatrix target)
{
    // Read both document maps before combining, exactly as the fields were
    // originally populated.
    var sourceIds = source.DocMap;
    var targetIds = target.DocMap;
    var combined = TermDocumentMatrix.Combine(source, target);

    this.Name = name;
    this._sourceDocs = sourceIds;
    this._targetDocs = targetIds;
    this._matrix = combined;
}
72 
/// <summary>
/// Creates a corpus around an already-built matrix. The supplied ID
/// sequences are stored as given (not copied) and are enumerated later when
/// the Edges text is generated.
/// </summary>
/// <param name="name">Name assigned to the corpus</param>
/// <param name="matrix">Term-document matrix used as the corpus matrix</param>
/// <param name="sourceIDs">IDs of the source documents</param>
/// <param name="targetIDs">IDs of the target documents</param>
public LDACorpus(string name, TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
{
    this._matrix = matrix;
    this._sourceDocs = sourceIDs;
    this._targetDocs = targetIDs;
    this.Name = name;
}
87 
88  #endregion
89 
90  #region Public accessors
91 
/// <summary>
/// Name of the corpus. Assigned by the constructors and not settable from
/// outside the class.
/// </summary>
public string Name
{
    get;
    private set;
}
96 
/// <summary>
/// Text listing every (source, target) document pair, one pair per line.
/// Each line holds the matrix index of the source document, a single space,
/// and the matrix index of the target document.
/// </summary>
public string Edges
{
    get
    {
        var pairLines = new StringBuilder();
        foreach (string sourceDoc in _sourceDocs)
        {
            foreach (string targetDoc in _targetDocs)
            {
                pairLines
                    .AppendFormat("{0} {1}", _matrix.GetDocumentIndex(sourceDoc), _matrix.GetDocumentIndex(targetDoc))
                    .AppendLine();
            }
        }
        return pairLines.ToString();
    }
}
116 
/// <summary>
/// R expression of the form list(integer(0),integer(0),...) containing one
/// integer(0) element for every document in the matrix.
/// </summary>
public string Links
{
    get
    {
        var rList = new StringBuilder("list(");
        for (int doc = 0; doc < _matrix.NumDocs; doc++)
        {
            // Comma goes between elements only, never before the first one.
            if (doc > 0)
            {
                rList.Append(",");
            }
            rList.Append("integer(0)");
        }
        rList.Append(")");
        return rList.ToString();
    }
}
135 
/// <summary>
/// R character vector of the form c("term1","term2",...) built from the
/// matrix's TermMap, in TermMap order.
/// </summary>
/// <remarks>
/// Backslashes and double quotes inside a term are escaped so that the
/// result stays a well-formed R string literal; terms without those
/// characters are emitted exactly as before.
/// </remarks>
public string Vocab
{
    get
    {
        // Backslashes must be doubled before quotes are escaped, otherwise
        // the backslash introduced for a quote would itself be doubled.
        // A null term is treated as empty, which is what String.Join did.
        IEnumerable<string> escapedTerms = _matrix.TermMap.Select(
            term => (term ?? String.Empty).Replace("\\", "\\\\").Replace("\"", "\\\""));
        return "c(\"" + String.Join("\",\"", escapedTerms) + "\")";
    }
}
146 
/// <summary>
/// R expression describing the corpus as a list with one element per
/// document. Each element has the form
/// structure(c(jL,1L,jL,1L,...), .Dim = c(2L, NL)), where every "jL,1L"
/// pair is a term index j followed by the constant 1, and a term whose
/// matrix value converts to the integer f contributes f such pairs.
/// </summary>
/// <remarks>
/// N in .Dim is the number of pairs actually written. It was previously
/// derived from the rounded sum of the document row, which can differ from
/// the number of pairs when the matrix holds non-integer values (each cell
/// is rounded individually when the pairs are generated) and would then
/// produce dimensions that do not match the data.
/// </remarks>
public string Matrix
{
    get
    {
        StringBuilder tdoc = new StringBuilder("list(");
        tdoc.AppendLine();
        for (int i = 0; i < _matrix.NumDocs; i++)
        {
            tdoc.Append("structure(c(");
            List<string> entries = new List<string>();
            for (int j = 0; j < _matrix.NumTerms; j++)
            {
                int freq = Convert.ToInt32(_matrix[i, j]);
                // The pair text is the same for every repetition of term j.
                string entry = j + "L,1L";
                for (int k = 1; k <= freq; k++)
                {
                    entries.Add(entry);
                }
            }
            tdoc.Append(String.Join(",", entries));
            // Column count must equal the number of pairs emitted above.
            tdoc.Append("), .Dim = c(2L, " + entries.Count + "L))");
            if (i < _matrix.NumDocs - 1)
            {
                tdoc.AppendLine(",");
            }
        }
        tdoc.AppendLine();
        tdoc.AppendLine(")");
        return tdoc.ToString();
    }
}
195 
/// <summary>
/// Document map of the underlying matrix. The matrix's own list is returned,
/// not a copy.
/// </summary>
public List<string> Map
{
    get
    {
        List<string> documentIds = _matrix.DocMap;
        return documentIds;
    }
}
206 
207  #endregion
208 
209  #region I/O
210 
{
    // NOTE(review): the signature line that precedes this body is not present
    // in this listing; the body builds and returns an LDACorpusInfo.
    //
    // Writes the Matrix, Vocab, Edges and Links texts to four cache files
    // created through RUtil.CreateCacheFile and records each file's path
    // (FileStream.Name) in the returned info object.
    //
    // Every stream/writer pair lives in a using block so the file handle is
    // released even when generating or writing the content throws; disposing
    // the writer flushes it before the stream is closed.
    LDACorpusInfo info = new LDACorpusInfo();
    info.Name = Name;

    // write matrix
    using (FileStream cFS = RUtil.CreateCacheFile(Name + ".corpus"))
    using (TextWriter corpus = new StreamWriter(cFS))
    {
        info.Corpus = cFS.Name;
        corpus.Write(Matrix);
    }

    // write vocab
    using (FileStream vFS = RUtil.CreateCacheFile(Name + ".vocab"))
    using (TextWriter vocab = new StreamWriter(vFS))
    {
        info.Vocab = vFS.Name;
        vocab.Write(Vocab);
    }

    // write edges
    // NOTE(review): the ".tableWriter" suffix is kept as-is; it does not follow
    // the naming of the other three files - confirm whether that is intended.
    using (FileStream eFS = RUtil.CreateCacheFile(Name + ".tableWriter"))
    using (TextWriter edges = new StreamWriter(eFS))
    {
        info.Edges = eFS.Name;
        edges.Write(Edges);
    }

    // write links
    using (FileStream lFS = RUtil.CreateCacheFile(Name + ".links"))
    using (TextWriter links = new StreamWriter(lFS))
    {
        info.Links = lFS.Name;
        links.Write(Links);
    }

    // return info
    return info;
}
251 
252  #endregion
253  }
254 
/// <summary>
/// Plain data holder describing where the pieces of an LDACorpus were
/// written. LDACorpus fills it with the corpus name and the paths of the
/// cache files it created.
/// </summary>
public class LDACorpusInfo
{
    /// <summary>Name of the corpus.</summary>
    public string Name;

    /// <summary>Path of the file holding the corpus matrix text.</summary>
    public string Corpus;

    /// <summary>Path of the file holding the vocabulary text.</summary>
    public string Vocab;

    /// <summary>Path of the file holding the edges text.</summary>
    public string Edges;

    /// <summary>Path of the file holding the links text.</summary>
    public string Links;
}
285 }