from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import TransportError
import json
import numpy as np
from pandas import json_normalize
import pandas as pd
import rank_metric as metrics
import sys
import os

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    ROOT = '/content/ProjectoRI2020'
else:
    ROOT = '.'


class ConvSearchEvaluation:
    """Loads the topic files and relevance judgments and evaluates ranked result lists."""

    def __init__(self):
        # Training topics
        with open(os.path.join(ROOT, "data/training/train_topics_v1.0.json"), "rt", encoding="utf-8") as f:
            self.train_topics = json.load(f)

        # Training qrels; fields: topic_turn_id, dummy, docid, rel
        self.relevance_judgments = pd.read_csv(
            os.path.join(ROOT, "data/training/train_topics_mod.qrel"),
            sep=' ', names=["topic_turn_id", "dummy", "docid", "rel"])

        # Test topics
        with open(os.path.join(ROOT, "data/evaluation/evaluation_topics_v1.0.json"), "rt", encoding="utf-8") as f:
            self.test_topics = json.load(f)

        # Test qrels; fields: topic_turn_id, dummy, docid, rel
        self.test_relevance_judgments = pd.read_csv(
            os.path.join(ROOT, "data/evaluation/evaluation_topics_mod.qrel"),
            sep=' ', names=["topic_turn_id", "dummy", "docid", "rel"])

        # Conversation ids (the part of topic_turn_id before '_') that have judged turns
        set_of_conversations = set(self.relevance_judgments['topic_turn_id'])
        self.judged_conversations = np.unique([a.split('_', 1)[0] for a in set_of_conversations])

    def eval(self, result, topic_turn_id):
        """Evaluate a ranked result set (DataFrame with an '_id' column) for one conversation turn.

        Returns [P@10, recall, average precision, NDCG@5, precisions@10%..100%, recalls@10%..100%].
        """
        total_retrieved_docs = len(result)

        # Try to get the relevance judgments from the TRAINING set
        aux = self.relevance_judgments.loc[self.relevance_judgments['topic_turn_id'] == topic_turn_id]

        # If that fails, fall back to the TEST set judgments
        if aux.empty:
            aux = self.test_relevance_judgments.loc[self.test_relevance_judgments['topic_turn_id'] == topic_turn_id]

        rel_docs = aux.loc[aux['rel'] != 0]
        query_rel_docs = rel_docs['docid']
        relv_judg_list = rel_docs['rel']
        total_relevant = relv_judg_list.count()  # assumes the turn has at least one relevant document

        NUMBER_OF_MEASURES = 10
        precisions = np.zeros(NUMBER_OF_MEASURES)
        recalls = np.zeros(NUMBER_OF_MEASURES)

        # Precision and recall at 10%, 20%, ..., 100% of the retrieved list
        for pos, percentage in enumerate([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]):
            k = max(int(percentage * total_retrieved_docs), 1)  # guard against k == 0 on very short result lists
            topk = result['_id'][:k]
            true_pos = np.intersect1d(topk, query_rel_docs)
            precisions[pos] = np.size(true_pos) / k
            recalls[pos] = np.size(true_pos) / total_relevant

        # Recall over the full retrieved list
        true_pos = np.intersect1d(result['_id'], query_rel_docs)
        recall = np.size(true_pos) / total_relevant

        # Relevance level of each retrieved document, in ranking order
        # (0 for unjudged or non-relevant documents)
        relev_judg_results = np.zeros(total_retrieved_docs)
        for _, doc in rel_docs.iterrows():
            relev_judg_results += ((result['_id'] == doc.docid) * doc.rel).to_numpy()

        # Rank-based metrics: P@10, NDCG@5, average precision and MRR
        p10 = metrics.precision_at_k(relev_judg_results, 10)
        ndcg5 = metrics.ndcg_at_k(r=relev_judg_results, k=5, method=1)
        ap = metrics.average_precision(relev_judg_results, total_relevant)
        mrr = metrics.mean_reciprocal_rank(relev_judg_results)  # computed but not included in the returned list

        return [p10, recall, ap, ndcg5, precisions, recalls]
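

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It assumes a
# local Elasticsearch instance, an index named "msmarco" with a "body" text
# field, and a judged turn id "1_1"; all three names are placeholders. It
# shows the input shape eval() expects: a DataFrame with one row per
# retrieved document, already in ranking order, whose '_id' column holds the
# document ids.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluation = ConvSearchEvaluation()

    es = Elasticsearch()  # assumes Elasticsearch is reachable on localhost:9200
    response = es.search(index="msmarco",
                         body={"size": 100,
                               "query": {"match": {"body": "What is throat cancer?"}}})

    # Flatten the hits into a DataFrame; json_normalize produces the '_id'
    # column that eval() relies on, preserving the ranking order.
    result = json_normalize(response["hits"]["hits"])

    p10, recall, ap, ndcg5, precisions, recalls = evaluation.eval(result, "1_1")
    print("P@10={:.4f}  Recall={:.4f}  AP={:.4f}  NDCG@5={:.4f}".format(p10, recall, ap, ndcg5))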