TraceLab Component Library
 All Classes Namespaces Files Functions Variables Enumerations Enumerator Properties
SimilarityUtil.cs
Go to the documentation of this file.
1 // TraceLab Component Library
2 // Copyright © 2012-2013 SEMERU
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 
17 using System;
18 using System.Collections.Generic;
19 using System.Linq;
20 using System.Text;
21 using TraceLabSDK.Types;
22 
23 namespace TraceLab.Components.DevelopmentKit.Utils.TermDocumentMatrixUtils
24 {
/// <summary>
/// Helpers for vector arithmetic (dot product, Euclidean length) and for
/// building cosine-similarity matrices from term-document matrices.
/// </summary>
public static class SimilarityUtil
{
    /// <summary>
    /// Computes the dot product of two vectors.
    /// </summary>
    /// <param name="vec1">First vector.</param>
    /// <param name="vec2">Second vector; must have the same length as <paramref name="vec1"/>.</param>
    /// <returns>The sum of the element-wise products; 0.0 for two empty vectors.</returns>
    /// <exception cref="ArgumentNullException">Either vector is null.</exception>
    /// <exception cref="DevelopmentKitException">The vectors differ in length.</exception>
    public static double ComputeDotProduct(double[] vec1, double[] vec2)
    {
        if (vec1 == null)
        {
            throw new ArgumentNullException("vec1");
        }
        if (vec2 == null)
        {
            throw new ArgumentNullException("vec2");
        }
        if (vec1.Length != vec2.Length)
        {
            throw new DevelopmentKitException("Vectors must be of equal length.");
        }

        double product = 0.0;
        for (int i = 0; i < vec1.Length; i++)
        {
            product += vec1[i] * vec2[i];
        }
        return product;
    }

    /// <summary>
    /// Computes the Euclidean length (L2 norm) of a vector.
    /// </summary>
    /// <param name="vector">The vector to measure.</param>
    /// <returns>The square root of the sum of squared elements; 0.0 for an empty vector.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="vector"/> is null.</exception>
    public static double ComputeLength(double[] vector)
    {
        if (vector == null)
        {
            throw new ArgumentNullException("vector");
        }

        double sumOfSquares = 0.0;
        for (int i = 0; i < vector.Length; i++)
        {
            sumOfSquares += Math.Pow(vector[i], 2);
        }
        return Math.Sqrt(sumOfSquares);
    }

    /// <summary>
    /// Computes the cosine similarity between every document of <paramref name="m1"/>
    /// and every document of <paramref name="m2"/>.
    /// </summary>
    /// <param name="m1">Matrix whose documents are used as link sources.</param>
    /// <param name="m2">Matrix whose documents are used as link targets.</param>
    /// <returns>
    /// A similarity matrix holding one link per (source, target) pair. For each source,
    /// the links are sorted with <c>TLLinksList.Sort()</c> before being added.
    /// A pair in which either vector has zero length scores 0.0.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either matrix is null.</exception>
    public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix m1, TermDocumentMatrix m2)
    {
        if (m1 == null)
        {
            throw new ArgumentNullException("m1");
        }
        if (m2 == null)
        {
            throw new ArgumentNullException("m2");
        }

        TLSimilarityMatrix sims = new TLSimilarityMatrix();
        // NOTE(review): Equalize presumably aligns both matrices to a shared term set so
        // that document vectors are comparable -- confirm against TermDocumentMatrix.
        List<TermDocumentMatrix> matrices = TermDocumentMatrix.Equalize(m1, m2);
        TermDocumentMatrix sourceMatrix = matrices[0];
        TermDocumentMatrix targetMatrix = matrices[1];

        // Target vectors and their lengths do not depend on the source document, so each
        // is fetched and measured once (on first use) instead of once per source.
        int targetCount = m2.NumDocs;
        double[][] targetDocs = new double[targetCount][];
        double[] targetLengths = new double[targetCount];

        for (int i = 0; i < m1.NumDocs; i++)
        {
            // Invariant across the inner loop: fetch and measure the source once.
            double[] sourceDoc = sourceMatrix.GetDocument(i);
            double sourceLength = ComputeLength(sourceDoc);
            string sourceName = m1.GetDocumentName(i);

            TLLinksList links = new TLLinksList();
            for (int j = 0; j < targetCount; j++)
            {
                if (targetDocs[j] == null)
                {
                    targetDocs[j] = targetMatrix.GetDocument(j);
                    targetLengths[j] = ComputeLength(targetDocs[j]);
                }
                double score = ComputeCosineScore(sourceDoc, sourceLength, targetDocs[j], targetLengths[j]);
                links.Add(new TLSingleLink(sourceName, m2.GetDocumentName(j), score));
            }

            links.Sort();
            foreach (TLSingleLink link in links)
            {
                sims.AddLink(link.SourceArtifactId, link.TargetArtifactId, link.Score);
            }
        }
        return sims;
    }

    /// <summary>
    /// Computes the cosine similarity between selected documents of a single matrix.
    /// </summary>
    /// <param name="matrix">Matrix containing both the source and the target documents.</param>
    /// <param name="sourceIDs">IDs of the documents used as link sources.</param>
    /// <param name="targetIDs">IDs of the documents used as link targets.</param>
    /// <returns>
    /// A similarity matrix holding one link per (source, target) pair, added in
    /// enumeration order. A pair in which either vector has zero length scores 0.0.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public static TLSimilarityMatrix ComputeCosine(TermDocumentMatrix matrix, IEnumerable<string> sourceIDs, IEnumerable<string> targetIDs)
    {
        if (matrix == null)
        {
            throw new ArgumentNullException("matrix");
        }
        if (sourceIDs == null)
        {
            throw new ArgumentNullException("sourceIDs");
        }
        if (targetIDs == null)
        {
            throw new ArgumentNullException("targetIDs");
        }

        TLSimilarityMatrix sims = new TLSimilarityMatrix();

        // Materialize the targets once: the sequence is walked for every source, and a
        // lazy or one-shot enumerable would otherwise be re-evaluated (or come back empty).
        List<string> targets = new List<string>(targetIDs);

        // Target vectors/lengths are cached on first use, which keeps the lookup order of
        // the first pass identical to a plain nested loop.
        double[][] targetDocs = new double[targets.Count][];
        double[] targetLengths = new double[targets.Count];

        foreach (string sourceID in sourceIDs)
        {
            double[] sourceDoc = matrix.GetDocument(sourceID);
            double sourceLength = ComputeLength(sourceDoc);

            for (int j = 0; j < targets.Count; j++)
            {
                if (targetDocs[j] == null)
                {
                    targetDocs[j] = matrix.GetDocument(targets[j]);
                    targetLengths[j] = ComputeLength(targetDocs[j]);
                }
                double score = ComputeCosineScore(sourceDoc, sourceLength, targetDocs[j], targetLengths[j]);
                sims.AddLink(sourceID, targets[j], score);
            }
        }
        return sims;
    }

    /// <summary>
    /// Cosine of the angle between two vectors whose lengths are already known.
    /// Returns 0.0 when the product of the lengths is exactly zero, so a zero-length
    /// vector yields a score instead of a division by zero.
    /// </summary>
    private static double ComputeCosineScore(double[] vec1, double length1, double[] vec2, double length2)
    {
        double lengthProduct = length1 * length2;
        if (lengthProduct == 0.0)
        {
            return 0.0;
        }
        return ComputeDotProduct(vec1, vec2) / lengthProduct;
    }
}
132 }