0
Reply

Measuring cosine similarity for multiple documents and queries

Khizer Hayat

Khizer Hayat

Jun 23 2015 12:49 AM
680
/// <summary>
/// Entry point: obtains the weight matrix from <c>getContent</c> and prints the
/// cosine similarity between its first two columns.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The matrix is indexed [term, document]; the first dimension is the vector length.
    var result = getContent();
    var length = result.GetLength(0);

    double[] doc1Array = new double[length];
    double[] doc2Array = new double[length];

    // Copy column 0 (first entry added to the corpus) and column 1 (second entry)
    // into flat vectors in a single pass.
    for (int i = 0; i < length; i++)
    {
        doc1Array[i] = result[i, 0];
        doc2Array[i] = result[i, 1];
    }

    var cosSimilarity = CalculateCosineSimilarity(doc1Array, doc2Array);

    // NOTE(review): the label says "Doc2", but column 1 is the second corpus entry
    // overall (the first document after the query) - confirm the intended wording.
    Console.WriteLine("Similarity between Query and Doc2: ");
    Console.WriteLine(cosSimilarity);
    Console.ReadKey();
}

 /// <summary>
 /// Builds the sample corpus (one query followed by three documents), tokenizes
 /// each entry on single spaces and returns the TF-IDF weight matrix for it.
 /// </summary>
 /// <returns>
 /// A matrix indexed [term, document]; column 0 is the query and columns 1-3 are
 /// the documents, in the order they are added below.
 /// </returns>
 private static double[,] getContent()
 {
     string query = "life learning";
     string document1 = "The game of life is a game of everlasting learning";
     string document2 = "The unexamined life is not worth living";
     string document3 = "Never stop learning";

     var splitedQuery = query.Split(' ').ToList();
     var splitedDocument1 = document1.Split(' ').ToList();
     var splitedDocument2 = document2.Split(' ').ToList();
     var splitedDocument3 = document3.Split(' ').ToList();

     // Vocabulary: every distinct token across the query and all documents
     // (comparison is case-sensitive, so "The" and "the" would be different terms).
     var unicalWords = (query + " " + document1 + " " + document2 + " " + document3)
         .Split(' ')
         .GroupBy(g => g)
         .Select(s => s.Key)
         .ToArray();

     List<List<string>> documents = new List<List<string>>();
     documents.Add(splitedQuery);
     documents.Add(splitedDocument1);
     documents.Add(splitedDocument2);
     documents.Add(splitedDocument3);

     return GetWeights(documents, unicalWords);
 }

 /// <summary>
 /// Computes a TF-IDF weight for every (term, document) pair.
 /// tf = occurrences of the term in the document / number of tokens in the document;
 /// idf = 1 + ln(number of documents / number of documents containing the term).
 /// </summary>
 /// <param name="splitedDocuments">Tokenized documents, one token list per document.</param>
 /// <param name="unicalWords">The vocabulary; one matrix row is produced per entry.</param>
 /// <returns>
 /// A matrix indexed [term, document]. A term that does not occur in a document,
 /// a term that occurs in no document at all, and every term of an empty document
 /// all get weight 0 (never NaN).
 /// </returns>
 public static double[,] GetWeights(List<List<string>> splitedDocuments, string[] unicalWords)
 {
     int termCount = unicalWords.Length;
     int documentCount = splitedDocuments.Count;
     double[,] matrix = new double[termCount, documentCount];

     // IDF depends only on the term, so compute it once per term instead of once
     // per (term, document) pair.
     double[] idf = new double[termCount];
     for (int j = 0; j < termCount; j++)
     {
         string term = unicalWords[j];
         int containingDocuments = splitedDocuments.Count(d => d.Contains(term));

         // A term found in no document would give ln(N / 0) = Infinity; its tf is
         // always 0, so the weight is defined as 0 rather than 0 * Infinity = NaN.
         idf[j] = containingDocuments > 0
             ? 1 + Math.Log((double)documentCount / containingDocuments)
             : 0;
     }

     for (int i = 0; i < documentCount; i++)
     {
         List<string> words = splitedDocuments[i];
         if (words.Count == 0)
         {
             // Empty document: tf would be 0 / 0; leave the whole column at 0.
             continue;
         }

         // Occurrence count of each distinct token in this document.
         Dictionary<string, int> occurrencesByWord = words
             .GroupBy(g => g)
             .ToDictionary(g => g.Key, g => g.Count());

         for (int j = 0; j < termCount; j++)
         {
             string term = unicalWords[j];
             int occurrences;

             // The null check keeps a null vocabulary entry harmless (weight 0)
             // instead of letting TryGetValue throw on a null key.
             if (term == null || !occurrencesByWord.TryGetValue(term, out occurrences))
             {
                 continue;
             }

             double tf = (double)occurrences / words.Count;
             matrix[j, i] = tf * idf[j];
         }
     }

     return matrix;
 }

 /// <summary>
 /// Cosine similarity of two vectors: dot(A, B) / (|A| * |B|).
 /// </summary>
 /// <param name="vecA">First vector.</param>
 /// <param name="vecB">Second vector; must be at least as long as <paramref name="vecA"/>.</param>
 /// <returns>
 /// The cosine of the angle between the vectors, or 0 when either vector has zero
 /// magnitude (so results stay sortable instead of being NaN).
 /// </returns>
 private static double CalculateCosineSimilarity(double[] vecA, double[] vecB)
 {
     var magnitudeProduct = Magnitude(vecA) * Magnitude(vecB);
     if (magnitudeProduct == 0)
     {
         return 0;
     }

     return DotProduct(vecA, vecB) / magnitudeProduct;
 }

 /// <summary>
 /// Sum of element-wise products over the indices of <paramref name="vecA"/>.
 /// </summary>
 /// <param name="vecA">First vector; its length determines how many elements are used.</param>
 /// <param name="vecB">Second vector; must be at least as long as <paramref name="vecA"/>.</param>
 /// <returns>The dot product.</returns>
 private static double DotProduct(double[] vecA, double[] vecB)
 {
     double dotProduct = 0;
     for (var i = 0; i < vecA.Length; i++)
     {
         dotProduct += vecA[i] * vecB[i];
     }

     return dotProduct;
 }

 /// <summary>
 /// Euclidean length of a vector, computed as the square root of its dot product with itself.
 /// </summary>
 /// <param name="vector">The vector to measure.</param>
 /// <returns>The non-negative magnitude.</returns>
 private static double Magnitude(double[] vector)
 {
     return Math.Sqrt(DotProduct(vector, vector));
 }
I want to check the similarity of one document against multiple queries, store the results in an array, and sort that array in descending order.