TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
JSD.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using TraceLabSDK.Types;
20 
21 namespace TraceLab.Components.DevelopmentKit.Tracers.InformationRetrieval
22 {
/// <summary>
/// Jensen-Shannon similarity between documents represented as term-weight vectors.
/// </summary>
public static class JSD
{
    /// <summary>
    /// Builds a term-document matrix for each artifact collection and computes the
    /// Jensen-Shannon similarity between every source and every target document.
    /// </summary>
    /// <param name="source">Source artifacts.</param>
    /// <param name="target">Target artifacts.</param>
    /// <returns>Similarity matrix holding one link per (source, target) pair.</returns>
    public static TLSimilarityMatrix Compute(TLArtifactsCollection source, TLArtifactsCollection target)
    {
        return Compute(new TermDocumentMatrix(source), new TermDocumentMatrix(target));
    }

    /// <summary>
    /// Computes the Jensen-Shannon similarity between every document of
    /// <paramref name="source"/> and every document of <paramref name="target"/>.
    /// </summary>
    /// <param name="source">Source term-document matrix.</param>
    /// <param name="target">Target term-document matrix.</param>
    /// <returns>Similarity matrix holding one link per (source, target) pair.</returns>
    public static TLSimilarityMatrix Compute(TermDocumentMatrix source, TermDocumentMatrix target)
    {
        // NOTE(review): Equalize presumably aligns both matrices on a common term
        // vocabulary so that document vectors have equal length - confirm against
        // TermDocumentMatrix. Element 0 is used as the source, element 1 as the target.
        List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(source, target);
        TermDocumentMatrix sourceMatrix = matrices[0];
        TermDocumentMatrix targetMatrix = matrices[1];

        TLSimilarityMatrix sims = new TLSimilarityMatrix();
        for (int i = 0; i < sourceMatrix.NumDocs; i++)
        {
            // The source row is the same for every target; look it up once per row.
            var sourceName = sourceMatrix.GetDocumentName(i);
            var sourceDocument = sourceMatrix.GetDocument(i);

            TLLinksList list = new TLLinksList();
            for (int j = 0; j < targetMatrix.NumDocs; j++)
            {
                list.Add(new TLSingleLink(sourceName, targetMatrix.GetDocumentName(j),
                    DocumentSimilarity(sourceDocument, targetMatrix.GetDocument(j))));
            }

            // Links of one source document are added in the list's own sort order.
            list.Sort();
            foreach (TLSingleLink link in list)
            {
                sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
            }
        }
        return sims;
    }

    /// <summary>
    /// Computes the Jensen-Shannon similarity (1 - Jensen-Shannon divergence, base-2
    /// logarithm) between two documents given as term-weight vectors over the same
    /// vocabulary.
    /// </summary>
    /// <param name="document1">Term weights of the first document.</param>
    /// <param name="document2">Term weights of the second document.</param>
    /// <returns>
    /// A value in [0, 1]. Returns 0 when the weights of either document sum to zero,
    /// because such a document cannot be turned into a probability distribution.
    /// </returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="ArgumentException">The vectors have different lengths.</exception>
    public static double DocumentSimilarity(double[] document1, double[] document2)
    {
        if (document1 == null)
        {
            throw new ArgumentNullException("document1");
        }
        if (document2 == null)
        {
            throw new ArgumentNullException("document2");
        }
        if (document1.Length != document2.Length)
        {
            throw new ArgumentException("Both documents must have the same number of terms.", "document2");
        }

        // 1. Transform the documents into two probability distributions.
        double sum1 = 0, sum2 = 0;
        for (int i = 0; i < document1.Length; i++)
        {
            sum1 = sum1 + document1[i];
            sum2 = sum2 + document2[i];
        }

        // Dividing by a zero sum would yield NaN entries, which entropy() silently
        // skips; the result would then exceed 1 and rank an empty document first.
        if (sum1 == 0 || sum2 == 0)
        {
            return 0;
        }

        double[] distribution1 = Normalize(document1, sum1);
        double[] distribution2 = Normalize(document2, sum2);

        // 2. Jensen-Shannon divergence: H((P + Q) / 2) - (H(P) + H(Q)) / 2.
        double[] mixture = mulDocument(0.5, sumDocument(distribution1, distribution2));
        double divergence = entropy(mixture) - (entropy(distribution1) + entropy(distribution2)) / 2;

        // 3. Jensen-Shannon similarity, clamped to absorb floating-point round-off.
        double similarity = 1 - divergence;
        if (similarity < 0)
        {
            return 0;
        }
        if (similarity > 1)
        {
            return 1;
        }
        return similarity;
    }

    /// <summary>
    /// Computes the Shannon entropy (base 2) of a distribution. Entries that are not
    /// strictly positive contribute nothing.
    /// </summary>
    /// <param name="docDistrib">Probability of each term.</param>
    /// <returns>Entropy in bits.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="docDistrib"/> is null.</exception>
    public static double entropy(double[] docDistrib)
    {
        if (docDistrib == null)
        {
            throw new ArgumentNullException("docDistrib");
        }

        double result = 0;
        for (int i = 0; i < docDistrib.Length; i++)
        {
            // Skip zero entries: 0 * log(0) is taken as 0.
            if (docDistrib[i] > 0)
            {
                result = result - docDistrib[i] * Math.Log(docDistrib[i], 2);
            }
        }

        return result;
    }

    /// <summary>
    /// Adds two vectors element by element.
    /// </summary>
    /// <param name="document1">First vector.</param>
    /// <param name="document2">Second vector; must be as long as the first.</param>
    /// <returns>A new array holding the element-wise sum.</returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="ArgumentException">The vectors have different lengths.</exception>
    public static double[] sumDocument(double[] document1, double[] document2)
    {
        if (document1 == null)
        {
            throw new ArgumentNullException("document1");
        }
        if (document2 == null)
        {
            throw new ArgumentNullException("document2");
        }
        if (document1.Length != document2.Length)
        {
            throw new ArgumentException("Both documents must have the same number of terms.", "document2");
        }

        double[] sum = new double[document1.Length];
        for (int i = 0; i < sum.Length; i++)
        {
            sum[i] = document1[i] + document2[i];
        }

        return sum;
    }

    /// <summary>
    /// Multiplies every element of a vector by a scalar.
    /// </summary>
    /// <param name="pScalar">Scalar factor.</param>
    /// <param name="pVector">Vector to scale; it is not modified.</param>
    /// <returns>A new array holding the scaled values.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pVector"/> is null.</exception>
    public static double[] mulDocument(double pScalar, double[] pVector)
    {
        if (pVector == null)
        {
            throw new ArgumentNullException("pVector");
        }

        double[] mul = new double[pVector.Length];
        for (int i = 0; i < mul.Length; i++)
        {
            mul[i] = pScalar * pVector[i];
        }
        return mul;
    }

    // Divides every weight by the given (non-zero) total to obtain a distribution.
    private static double[] Normalize(double[] document, double sum)
    {
        double[] distribution = new double[document.Length];
        for (int i = 0; i < document.Length; i++)
        {
            distribution[i] = document[i] / sum;
        }
        return distribution;
    }
}
152 }