TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
LDAScript.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using RPlugin.Core;
18 using RPlugin.Exceptions;
19 using System;
20 using System.Collections.Generic;
21 using System.IO;
22 using System.Reflection;
23 using TraceLab.Components.RPlugin.Properties;
24 using TraceLab.Components.Types.Tracers.InformationRetrieval;
25 using TraceLabSDK.Types;
26 
namespace TraceLab.Components.DevelopmentKit.Tracers.InformationRetrieval
{
    /// <summary>
    /// R script wrapper for the bundled <c>LDA.R</c> script.
    /// <see cref="PreCompute"/> writes an LDA corpus built from a source and a
    /// target term-document matrix and assembles the script arguments;
    /// <see cref="ImportResults"/> reads the file the script was told to write
    /// and turns it into a <see cref="TLSimilarityMatrix"/>.
    /// </summary>
    public class LDAScript : RScript
    {
        private readonly string _baseScript = Settings.Default.Resources + "LDA.R";
        private readonly string[] _requiredPackages = new string[] { "lda" };

        private readonly TermDocumentMatrix _source;
        private readonly TermDocumentMatrix _target;
        private readonly LDAConfig _config;

        // Path of the results file; assigned by PreCompute and read by ImportResults.
        private string _outputFile;

        /// <summary>
        /// Gets the name of the R script resource: the configured resources
        /// prefix followed by <c>LDA.R</c>.
        /// </summary>
        public override string BaseScript
        {
            get
            {
                return _baseScript;
            }
        }

        /// <summary>
        /// Gets the R packages this script declares as required (only <c>lda</c>).
        /// </summary>
        public override string[] RequiredPackages
        {
            get
            {
                return _requiredPackages;
            }
        }

        /// <summary>
        /// Creates the script from artifact collections; each collection is
        /// converted to a <see cref="TermDocumentMatrix"/>.
        /// </summary>
        /// <param name="source">Source artifacts.</param>
        /// <param name="target">Target artifacts.</param>
        /// <param name="config">LDA configuration whose values are passed to the script.</param>
        public LDAScript(TLArtifactsCollection source, TLArtifactsCollection target, LDAConfig config) : base()
        {
            _source = new TermDocumentMatrix(source);
            _target = new TermDocumentMatrix(target);
            _config = config;
        }

        /// <summary>
        /// Creates the script from already-built term-document matrices.
        /// </summary>
        /// <param name="source">Source term-document matrix.</param>
        /// <param name="target">Target term-document matrix.</param>
        /// <param name="config">LDA configuration whose values are passed to the script.</param>
        public LDAScript(TermDocumentMatrix source, TermDocumentMatrix target, LDAConfig config) : base()
        {
            _source = source;
            _target = target;
            _config = config;
        }

        /// <summary>
        /// Registers the script, saves the LDA corpus for the source and target
        /// matrices, reserves the results file, and builds the argument list.
        /// </summary>
        public override void PreCompute()
        {
            RUtil.RegisterScript(Assembly.GetExecutingAssembly(), _baseScript);

            LDACorpus corpus = new LDACorpus("LDA", _source, _target);
            LDACorpusInfo info = corpus.Save();
            _outputFile = RUtil.ReserveCacheFile("LDA.out");

            // The order of the entries is significant: it is the order in which
            // the values are handed to the script.
            _arguments = new List<object>
            {
                info.Corpus,
                info.Vocab,
                info.Edges,
                _outputFile,
                _config.NumTopics,
                _config.NumIterations,
                _config.Alpha,
                _config.Eta,
                _config.PredictionBeta,
                _config.Seed
            };
        }

        /// <summary>
        /// Reads the results file reserved in <see cref="PreCompute"/> and
        /// converts it into a similarity matrix with one link per
        /// (source document, target document) pair.
        /// </summary>
        /// <param name="result">Script execution result; not used by this implementation.</param>
        /// <returns>A <see cref="TLSimilarityMatrix"/> boxed as <see cref="object"/>.</returns>
        /// <exception cref="RDataException">
        /// The results file is too short to parse, or the number of values does
        /// not equal source document count times target document count.
        /// </exception>
        public override object ImportResults(RScriptResult result)
        {
            string rawdata;
            // 'using' guarantees the reader is closed even if the read throws.
            using (TextReader rfile = new StreamReader(_outputFile))
            {
                rawdata = rfile.ReadToEnd();
            }

            // The first two characters are dropped below; report a short file as
            // a data error instead of letting Remove throw an unrelated exception.
            if (rawdata.Length < 2)
            {
                throw new RDataException("Results file is empty or truncated: " + _outputFile);
            }

            // Strip the two-character prefix and every ')' and split on commas.
            // NOTE(review): presumably the file holds an R vector literal of the
            // form "c(v1, v2, ...)" - confirm against LDA.R.
            string[] sims = rawdata
                .Remove(0, 2)
                .Replace(")", String.Empty)
                .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

            int sourceCount = _source.DocMap.Count;
            int targetCount = _target.DocMap.Count;
            int expectedCount = sourceCount * targetCount;
            if (sims.Length != expectedCount)
            {
                throw new RDataException("Results are incorrect size: " + sims.Length + " vs " + expectedCount);
            }

            TLSimilarityMatrix matrix = new TLSimilarityMatrix();

            // Values are consumed source-by-source: the first targetCount values
            // belong to source document 0, the next targetCount to document 1, ...
            // (targetCount cannot be 0 inside the loop: that would make
            // expectedCount 0 and the loop would not run.)
            for (int i = 0; i < sims.Length; i++)
            {
                int src = i / targetCount;
                int tgt = i % targetCount;

                // Parse with the invariant culture: the values are machine-written
                // with '.' as decimal separator, and the current culture may treat
                // '.' as a group separator and silently misread them.
                double score = Convert.ToDouble(sims[i].Trim(), System.Globalization.CultureInfo.InvariantCulture);

                matrix.AddLink(_source.DocMap[src], _target.DocMap[tgt], score);
            }

            return matrix;
        }
    }
}