import gc
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

import plots

# constants
TURNS_PER_CONV = 8
MEASURES_PER_TURN = 10
PROJ_DIR = "/content/drive/My Drive/faculdade/fct-miei/04_ano4_(year4)/semestre1/ri/infos_projeto"


def project(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, elastic, test_bed,
            topics, relevance_judgments, idx, setName, showPlots=False):
    # NOTE: this parameter was originally named `plots`, which shadowed the
    # `plots` module and broke every plotting call below; renamed to `showPlots`.

    # totals
    _recall = 0
    _p10 = 0
    _ndcg5 = 0

    # counters
    _ntopics = 0
    _nturns = 0
    _ntotalTurns = 0

    # metrics
    _p10s = np.array([])
    _aps = np.array([])
    _ndcg5s = np.array([])
    _precisions = np.array([])
    _recalls = np.array([])

    # conversation and turn numbers and names
    _convNumbers = []
    _convNames = np.array([])

    for topic in topics:
        convID = topic['number']
        if convID not in idx:
            continue

        _convNumbers.append(convID)
        _convNames = np.append(_convNames, str(convID) + " " + topic['title'])

        _turnPrecisions = np.array([])
        _turnRecalls = np.array([])

        for turn in topic['turn'][:TURNS_PER_CONV]:
            turnID = turn['number']
            utterance = turn['raw_utterance']
            topicTurnID = '%d_%d' % (convID, turnID)

            aux = relevance_judgments.loc[relevance_judgments['topic_turn_id'] == topicTurnID]
            num_rel = aux.loc[aux['rel'] != 0]['docid'].count()

            _convNames = np.append(_convNames, topicTurnID + " " + utterance)
            _ntotalTurns += 1

            # turns without relevance judgments are skipped and marked with NaN
            if num_rel == 0:
                _p10s = np.append(_p10s, np.nan)
                _aps = np.append(_aps, np.nan)
                _ndcg5s = np.append(_ndcg5s, np.nan)
                _turnPrecisions = np.append(_turnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _turnRecalls = np.append(_turnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                continue

            if UPDATE_ELASTICSEARCH_RESULTS:
                result = elastic.search_body(query=utterance, numDocs=REL_DOCS_PER_TURN)
                pickle.dump(result, open(PROJ_DIR + "/pkls/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/" + topicTurnID + ".pkl", "wb"))
            else:
                result = pickle.load(open(PROJ_DIR + "/pkls/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/" + topicTurnID + ".pkl", "rb"))

            # turns with no retrieved documents score zero on every metric
            # (the num_rel == 0 case was already handled above)
            if np.size(result) == 0:
                _p10s = np.append(_p10s, 0.0)
                _aps = np.append(_aps, 0.0)
                _ndcg5s = np.append(_ndcg5s, 0.0)
                _turnPrecisions = np.append(_turnPrecisions, np.zeros(MEASURES_PER_TURN))
                _turnRecalls = np.append(_turnRecalls, np.zeros(MEASURES_PER_TURN))
                if showPlots:
                    print(topicTurnID, utterance, num_rel, "NO RESULTS")
                continue

            if showPlots:
                print(topicTurnID, utterance, num_rel)

            [p10, recall, ap, ndcg5, precisions, recalls] = test_bed.eval(result[['_id', '_score']], topicTurnID)

            # accumulate per-turn precisions and recalls for this conversation
            _turnPrecisions = np.append(_turnPrecisions, precisions)
            _turnRecalls = np.append(_turnRecalls, recalls)

            # legacy per-turn plots, kept for reference (disabled):
            # plots.plotLines(PROJ_DIR, REL_DOCS_PER_TURN, "precision", [range(11)], [_turnPrecisions], ["LMD"])
            # plots.plotLines(PROJ_DIR, REL_DOCS_PER_TURN, "recall", [range(11)], [_turnRecalls], ["LMD"])
            # plots.plotLines(PROJ_DIR, REL_DOCS_PER_TURN, "precision-recall", [_turnRecalls], [_turnPrecisions], ["LMD"])

            if showPlots:
                print('P@10=', p10, ' Recall=', recall, ' AP=', ap, ' NDCG@5=', ndcg5)

            # totals
            _recall = _recall + recall
            _p10 = _p10 + p10
            _ndcg5 = _ndcg5 + ndcg5

            # metrics
            _p10s = np.append(_p10s, p10)
            _aps = np.append(_aps, ap)
            _ndcg5s = np.append(_ndcg5s, ndcg5)

            # counters
            _nturns = _nturns + 1

        # pad the conversation with NaNs up to TURNS_PER_CONV turns
        while _ntotalTurns % TURNS_PER_CONV != 0:
            _ntotalTurns += 1
            _p10s = np.append(_p10s, np.nan)
            _aps = np.append(_aps, np.nan)
            _ndcg5s = np.append(_ndcg5s, np.nan)
            _turnPrecisions = np.append(_turnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _turnRecalls = np.append(_turnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
            _convNames = np.append(_convNames, "NO RESULT")

        # compute conversation means
        _turnPrecisions = np.reshape(_turnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _turnRecalls = np.reshape(_turnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))

        # add precisions and recalls to the global matrices
        _precisions = np.append(_precisions, np.nanmean(_turnPrecisions, axis=0))
        _recalls = np.append(_recalls, np.nanmean(_turnRecalls, axis=0))

        # counters
        _ntopics += 1

    # metrics
    _p10s = np.reshape(_p10s, (_ntopics, TURNS_PER_CONV))
    _aps = np.reshape(_aps, (_ntopics, TURNS_PER_CONV))
    _ndcg5s = np.reshape(_ndcg5s, (_ntopics, TURNS_PER_CONV))
    _precisions = np.reshape(_precisions, (_ntopics, MEASURES_PER_TURN))
    _recalls = np.reshape(_recalls, (_ntopics, MEASURES_PER_TURN))

    # conversation and turn names
    _convNames = np.reshape(_convNames, (_ntopics, TURNS_PER_CONV + 1))

    # generate plots
    if showPlots:
        plots.plotMetricAlongConversation(PROJ_DIR, REL_DOCS_PER_TURN, setName, "Average Precision", [_aps], ["LMD"], _convNumbers)
        plots.plotMetricAlongConversation(PROJ_DIR, REL_DOCS_PER_TURN, setName, "normalized Discounted Cumulative Gain", [_ndcg5s], ["LMD"], _convNumbers)
        plots.plotMetricEachConversation(PROJ_DIR, REL_DOCS_PER_TURN, setName, "Average Precision", [_aps], ["LMD"], _convNumbers, _convNames)
        plots.plotMetricEachConversation(PROJ_DIR, REL_DOCS_PER_TURN, setName, "normalized Discounted Cumulative Gain", [_ndcg5s], ["LMD"], _convNumbers, _convNames)
        plots.plotPrecisionRecall(PROJ_DIR, REL_DOCS_PER_TURN, setName, [_recalls], [_precisions], ["LMD"], _convNumbers)

    # overall means
    _p10 = _p10 / _nturns
    _recall = _recall / _nturns
    _ndcg5 = _ndcg5 / _nturns

    if showPlots:
        print()
        print('P@10=', _p10, ' Recall=', _recall, ' NDCG@5=', _ndcg5)

    return _aps, _ndcg5s, _recalls, _precisions, _convNumbers, _convNames
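

# Hedged sketch, not project code: `test_bed.eval` is provided elsewhere.
# This minimal approximation shows how the metrics it is assumed to return
# (P@10, recall, AP, NDCG@5) are computed; `_demo_eval` and its arguments are
# hypothetical names, not part of the project API.
def _demo_eval(ranked_doc_ids, relevant_doc_ids):
    rel = [1 if d in relevant_doc_ids else 0 for d in ranked_doc_ids]
    p10 = sum(rel[:10]) / 10
    recall = sum(rel) / max(len(relevant_doc_ids), 1)
    # average precision: mean of precision at each relevant rank
    hits, ap = 0, 0.0
    for i, r in enumerate(rel, start=1):
        if r:
            hits += 1
            ap += hits / i
    ap /= max(len(relevant_doc_ids), 1)
    # NDCG@5 with binary gains: DCG over the top 5 ranks over the ideal DCG
    dcg5 = sum(r / np.log2(i + 1) for i, r in enumerate(rel[:5], start=1))
    ideal = sum(1 / np.log2(i + 1) for i in range(1, min(len(relevant_doc_ids), 5) + 1))
    ndcg5 = dcg5 / ideal if ideal > 0 else 0.0
    return p10, recall, ap, ndcg5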


def project2Train(REL_DOCS_PER_TURN, UPDATE_BERT_RESULTS, tokenizer, model, device,
                  topics, relevanceJudgments, topicsIDs, setName):
    # variables
    totalDocs = 0
    triplets = np.array([])
    features = np.array([])
    classes = np.array([])

    print("Train")
    print("-- Building triplets")
    for topic in topics:
        convID = topic['number']
        if convID not in topicsIDs:
            continue

        for turn in topic['turn'][:TURNS_PER_CONV]:
            turnID = turn['number']
            topicTurnID = '%d_%d' % (convID, turnID)

            info = relevanceJudgments.loc[relevanceJudgments['topic_turn_id'] == topicTurnID]
            numberRel = info.loc[info['rel'] != 0]['docid'].count()
            if numberRel == 0:
                continue

            pickleFile = pickle.load(open(PROJ_DIR + "/pkls/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/" + topicTurnID + ".pkl", "rb"))
            if np.size(pickleFile) == 0:
                continue

            question = turn['raw_utterance']
            for docID in info['docid']:
                docInfo = info.loc[info['docid'] == docID]
                passageList = pickleFile[pickleFile['_id'] == docID]['_source.body'].values
                if len(passageList) > 0:
                    passage = passageList[0]
                    # binary label: 1 if the judged relevance is positive
                    classes = np.append(classes, (docInfo['rel'].values[0] > 0) * 1)
                    triplets = np.append(triplets, np.array([str(question), str(passage)]))
                    totalDocs += 1

    triplets = np.reshape(triplets, (totalDocs, 2))
    classes = np.reshape(classes, (totalDocs,))

    print("-- Getting BERT embeddings")
    if UPDATE_BERT_RESULTS:
        for triplet in triplets:
            # NOTE: truncation=False assumes every (query, passage) pair fits
            # within BERT's 512-token limit; longer inputs would error out
            bertInput = convert_to_bert_input(sentences=triplet, max_seq_length=512, tokenizer=tokenizer, padding='max_length', truncation=False)
            bertInput['input_ids'] = bertInput['input_ids'].to(device)
            bertInput['attention_mask'] = bertInput['attention_mask'].to(device)
            bertInput['token_type_ids'] = bertInput['token_type_ids'].to(device)
            bertOutput = model(**bertInput)
            # keep the [CLS] token embedding as the feature vector
            featuresLine = bertOutput["last_hidden_state"][0, 0].detach().clone().cpu().numpy()
            features = np.append(features, featuresLine)
        features = np.reshape(features, (totalDocs, 768))
        pickle.dump(features, open(PROJ_DIR + "/bert/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/all.pkl", "wb"))
    else:
        features = pickle.load(open(PROJ_DIR + "/bert/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/all.pkl", "rb"))

    print("-- Training classifier")
    folds = 7
    maxIter = 500
    bestValidError = 1
    bestC = -10
    # grid-search the regularization strength C over 10^-10 .. 10^0 with
    # stratified cross-validation, keeping the C with the lowest mean
    # validation error
    for C in range(-10, 1):
        trainError = validError = 0
        for train, valid in StratifiedKFold(n_splits=folds).split(classes, classes):
            lr = LogisticRegression(C=10**C, random_state=0, max_iter=maxIter)
            lr.fit(features[train, :], classes[train])
            trainError += 1 - lr.score(features[train, :], classes[train])
            validError += 1 - lr.score(features[valid, :], classes[valid])
        if validError / folds < bestValidError:
            bestValidError = validError / folds
            bestC = C

    # refit on the full training set with the selected C
    classifier = LogisticRegression(random_state=0, max_iter=maxIter, C=10**bestC)
    classifier.fit(features, classes)
    print("-- Training done (C: 10^{} ; valid: {})".format(bestC, bestValidError))

    gc.collect()
    return classifier
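

# Design note, hedged sketch rather than project code: the manual grid search
# above is equivalent in spirit to scikit-learn's LogisticRegressionCV, which
# runs the same stratified cross-validated search over Cs = 10^-10 .. 10^0 in
# a single call (its default scoring is accuracy, so minimizing error and
# maximizing score select the same C).
def _demo_logreg_cv(features, classes):
    from sklearn.linear_model import LogisticRegressionCV
    clf = LogisticRegressionCV(Cs=np.logspace(-10, 0, 11), cv=7,
                               random_state=0, max_iter=500)
    return clf.fit(features, classes)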


def project2Test(REL_DOCS_PER_TURN, UPDATE_BERT_RESULTS, classifier, tokenizer, model, device,
                 topics, test_bed, relevanceJudgments, topicsIDs, setName):
    # counters
    _ntopics = 0
    _nturns = 0
    _ntotalTurns = 0

    # metrics
    _p10s = np.array([])
    _aps = np.array([])
    _ndcg5s = np.array([])
    _precisions = np.array([])
    _recalls = np.array([])

    print("Test")
    for topic in topics:
        convID = topic['number']
        if convID not in topicsIDs:
            continue

        _turnPrecisions = np.array([])
        _turnRecalls = np.array([])

        for turn in topic['turn'][:TURNS_PER_CONV]:
            features = np.array([])

            turnID = turn['number']
            topicTurnID = '%d_%d' % (convID, turnID)

            info = relevanceJudgments.loc[relevanceJudgments['topic_turn_id'] == topicTurnID]
            numberRel = info.loc[info['rel'] != 0]['docid'].count()

            _ntotalTurns += 1
            if numberRel == 0:
                _p10s = np.append(_p10s, np.nan)
                _aps = np.append(_aps, np.nan)
                _ndcg5s = np.append(_ndcg5s, np.nan)
                _turnPrecisions = np.append(_turnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _turnRecalls = np.append(_turnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                continue

            pickleFile = pickle.load(open(PROJ_DIR + "/pkls/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/" + topicTurnID + ".pkl", "rb"))
            if np.size(pickleFile) == 0:
                _p10s = np.append(_p10s, 0.0)
                _aps = np.append(_aps, 0.0)
                _ndcg5s = np.append(_ndcg5s, 0.0)
                _turnPrecisions = np.append(_turnPrecisions, np.zeros(MEASURES_PER_TURN))
                _turnRecalls = np.append(_turnRecalls, np.zeros(MEASURES_PER_TURN))
                continue

            print("-- {}".format(topicTurnID))

            # legacy triplet-building path, kept for reference (disabled):
            # print("-- -- Building triplets")
            # question = turn['raw_utterance']
            # for docID in pickleFile['_id']:
            #     docInfo = info.loc[info['docid'] == docID]
            #     passageList = pickleFile[pickleFile['_id'] == docID]['_source.body'].values
            #     if len(passageList) > 0:
            #         passage = passageList[0]
            #         triplets = np.append(triplets, np.array([str(question), str(passage), str(docID)]))
            #         rel = 0
            #         if len(docInfo['rel'].values) > 0:
            #             rel = docInfo['rel'].values[0]
            #         pred = np.append(pred, rel)
            #         totalDocs += 1
            # triplets = np.reshape(triplets, (totalDocs, 3))
            # pred = np.reshape(pred, (totalDocs,))

            print("-- -- Getting BERT embeddings")
            # fix: `question` was only defined inside the disabled block above
            question = turn['raw_utterance']
            if UPDATE_BERT_RESULTS:
                passages = []
                for docID in pickleFile['_id']:
                    passages.append(pickleFile[pickleFile['_id'] == docID]['_source.body'].values[0])
                for passage in passages:
                    bertInput = convert_to_bert_input(sentences=[question, passage], max_seq_length=512, tokenizer=tokenizer, padding='max_length', truncation=False)
                    bertInput['input_ids'] = bertInput['input_ids'].to(device)
                    bertInput['attention_mask'] = bertInput['attention_mask'].to(device)
                    bertInput['token_type_ids'] = bertInput['token_type_ids'].to(device)
                    bertOutput = model(**bertInput)
                    featuresLine = bertOutput["last_hidden_state"][0, 0].detach().clone().cpu().numpy()
                    features = np.append(features, featuresLine)
                # fix: the reshape used `totalDocs`, which is never incremented
                # on this path; the number of rows is the number of passages
                features = np.reshape(features, (len(passages), 768))
                pickle.dump(features, open(PROJ_DIR + "/bert/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/" + topicTurnID + ".pkl", "wb"))
            else:
                features = pickle.load(open(PROJ_DIR + "/bert/" + setName + "/" + str(REL_DOCS_PER_TURN) + "/" + topicTurnID + ".pkl", "rb"))

            print("-- -- Testing classifier")
            # pred = classifier.predict(features)  # legacy hard predictions (disabled)
            prob = classifier.predict_proba(features)[:, 1]

            print("-- -- Sorting passages")
            # order passages by descending relevance probability
            newOrder = np.lexsort(np.reshape(prob, (1, prob.shape[0])))[::-1]
            triplets = np.column_stack((pickleFile['_id'], prob))
            triplets = triplets[newOrder, :]
            triplets = pd.DataFrame(data=triplets, columns=['_id', '_score'])

            print("-- -- Plotting")
            # legacy evaluation call (disabled):
            # [p10, recall, ap, ndcg5, precisions, recalls] = test_bed.eval2(triplets, pred, prob)
            [p10, recall, ap, ndcg5, precisions, recalls] = test_bed.eval(triplets, topicTurnID)

            # accumulate per-turn precisions and recalls for this conversation
            _turnPrecisions = np.append(_turnPrecisions, precisions)
            _turnRecalls = np.append(_turnRecalls, recalls)

            # metrics
            _p10s = np.append(_p10s, p10)
            _aps = np.append(_aps, ap)
            _ndcg5s = np.append(_ndcg5s, ndcg5)

            # counters
            _nturns += 1

        # pad the conversation with NaNs up to TURNS_PER_CONV turns
        while _ntotalTurns % TURNS_PER_CONV != 0:
            _ntotalTurns += 1
            _p10s = np.append(_p10s, np.nan)
            _aps = np.append(_aps, np.nan)
            _ndcg5s = np.append(_ndcg5s, np.nan)
            _turnPrecisions = np.append(_turnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _turnRecalls = np.append(_turnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)

        # compute conversation means
        _turnPrecisions = np.reshape(_turnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _turnRecalls = np.reshape(_turnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))

        # add precisions and recalls to the global matrices
        _precisions = np.append(_precisions, np.nanmean(_turnPrecisions, axis=0))
        _recalls = np.append(_recalls, np.nanmean(_turnRecalls, axis=0))

        # counters
        _ntopics += 1
        gc.collect()

    # metrics
    _p10s = np.reshape(_p10s, (_ntopics, TURNS_PER_CONV))
    _aps = np.reshape(_aps, (_ntopics, TURNS_PER_CONV))
    _ndcg5s = np.reshape(_ndcg5s, (_ntopics, TURNS_PER_CONV))
    _precisions = np.reshape(_precisions, (_ntopics, MEASURES_PER_TURN))
    _recalls = np.reshape(_recalls, (_ntopics, MEASURES_PER_TURN))

    return _aps, _ndcg5s, _recalls, _precisions
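

# Hedged sketch, not project code: demonstrates the reranking step above on
# toy data. np.lexsort with a single key row is equivalent to np.argsort, so
# reversing it orders documents by descending classifier probability.
def _demo_probability_rerank():
    doc_ids = np.array(['MARCO_1', 'CAR_2', 'MARCO_3'])
    prob = np.array([0.2, 0.9, 0.5])
    order = np.lexsort(prob.reshape(1, -1))[::-1]  # same as np.argsort(prob)[::-1]
    ranked = pd.DataFrame({'_id': doc_ids[order], '_score': prob[order]})
    return ranked  # CAR_2 (0.9), MARCO_3 (0.5), MARCO_1 (0.2)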


def project3(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device,
             nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName,
             convNumbers, convNames):
    ## METHOD 1
    method1Metrics = project3Method1(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device, nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName, convNumbers, convNames)
    APs = [method1Metrics["aps"]["lmd"], method1Metrics["aps"]["bert"]]
    nDCGs = [method1Metrics["ndcg5s"]["lmd"], method1Metrics["ndcg5s"]["bert"]]
    Recalls = [method1Metrics["recalls"]["lmd"], method1Metrics["recalls"]["bert"]]
    Precisions = [method1Metrics["precisions"]["lmd"], method1Metrics["precisions"]["bert"]]
    doPlots(REL_DOCS_PER_TURN, setName, "Phase 3 Method 1", APs, nDCGs, Recalls, Precisions, ["LMD", "BERT"], convNumbers, convNames)

    ## METHOD 2
    method2Metrics = project3Method2(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device, nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName, convNumbers, convNames)
    APs = [method2Metrics["aps"]["lmd"], method2Metrics["aps"]["bert"]]
    nDCGs = [method2Metrics["ndcg5s"]["lmd"], method2Metrics["ndcg5s"]["bert"]]
    Recalls = [method2Metrics["recalls"]["lmd"], method2Metrics["recalls"]["bert"]]
    Precisions = [method2Metrics["precisions"]["lmd"], method2Metrics["precisions"]["bert"]]
    doPlots(REL_DOCS_PER_TURN, setName, "Phase 3 Method 2", APs, nDCGs, Recalls, Precisions, ["LMD", "BERT"], convNumbers, convNames)

    ## METHOD 3
    method3Metrics = project3Method3(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device, nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName, convNumbers, convNames)
    APs = [method3Metrics["aps"]["lmd"], method3Metrics["aps"]["bert"]]
    nDCGs = [method3Metrics["ndcg5s"]["lmd"], method3Metrics["ndcg5s"]["bert"]]
    Recalls = [method3Metrics["recalls"]["lmd"], method3Metrics["recalls"]["bert"]]
    Precisions = [method3Metrics["precisions"]["lmd"], method3Metrics["precisions"]["bert"]]
    doPlots(REL_DOCS_PER_TURN, setName, "Phase 3 Method 3", APs, nDCGs, Recalls, Precisions, ["LMD", "BERT"], convNumbers, convNames)

    # combined: compare the three methods per ranker
    APs = [method1Metrics["aps"]["lmd"], method2Metrics["aps"]["lmd"], method3Metrics["aps"]["lmd"]]
    nDCGs = [method1Metrics["ndcg5s"]["lmd"], method2Metrics["ndcg5s"]["lmd"], method3Metrics["ndcg5s"]["lmd"]]
    Recalls = [method1Metrics["recalls"]["lmd"], method2Metrics["recalls"]["lmd"], method3Metrics["recalls"]["lmd"]]
    Precisions = [method1Metrics["precisions"]["lmd"], method2Metrics["precisions"]["lmd"], method3Metrics["precisions"]["lmd"]]
    doPlots(REL_DOCS_PER_TURN, setName, "Phase 3 LMD", APs, nDCGs, Recalls, Precisions, ["Method 1", "Method 2", "Method 3"], convNumbers, convNames)

    APs = [method1Metrics["aps"]["bert"], method2Metrics["aps"]["bert"], method3Metrics["aps"]["bert"]]
    nDCGs = [method1Metrics["ndcg5s"]["bert"], method2Metrics["ndcg5s"]["bert"], method3Metrics["ndcg5s"]["bert"]]
    Recalls = [method1Metrics["recalls"]["bert"], method2Metrics["recalls"]["bert"], method3Metrics["recalls"]["bert"]]
    Precisions = [method1Metrics["precisions"]["bert"], method2Metrics["precisions"]["bert"], method3Metrics["precisions"]["bert"]]
    doPlots(REL_DOCS_PER_TURN, setName, "Phase 3 BERT", APs, nDCGs, Recalls, Precisions, ["Method 1", "Method 2", "Method 3"], convNumbers, convNames)


def project3Method1(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device,
                    nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName,
                    convNumbers, convNames):
    ## METHOD 1: expand every turn after the first with the conversation's first utterance
    print("Method 1")

    # counters
    _LMDntopics = 0
    _LMDnturns = 0
    _LMDntotalTurns = 0
    _BERTntopics = 0
    _BERTnturns = 0
    _BERTntotalTurns = 0

    # metrics
    _LMDp10s = np.array([])
    _LMDaps = np.array([])
    _LMDndcg5s = np.array([])
    _LMDprecisions = np.array([])
    _LMDrecalls = np.array([])
    _BERTp10s = np.array([])
    _BERTaps = np.array([])
    _BERTndcg5s = np.array([])
    _BERTprecisions = np.array([])
    _BERTrecalls = np.array([])

    for topic in topics:
        convID = topic['number']
        if convID not in topicsIDs:
            continue

        _LMDturnPrecisions = np.array([])
        _LMDturnRecalls = np.array([])
        _BERTturnPrecisions = np.array([])
        _BERTturnRecalls = np.array([])

        print("-- {}".format(convID))
        firstUtterance = ""
        for turn in topic['turn'][:TURNS_PER_CONV]:
            turnID = turn['number']
            topicTurnID = '%d_%d' % (convID, turnID)

            info = relevanceJudgments.loc[relevanceJudgments['topic_turn_id'] == topicTurnID]
            numberRel = info.loc[info['rel'] != 0]['docid'].count()

            _LMDntotalTurns += 1
            if numberRel == 0:
                _LMDp10s = np.append(_LMDp10s, np.nan)
                _LMDaps = np.append(_LMDaps, np.nan)
                _LMDndcg5s = np.append(_LMDndcg5s, np.nan)
                _LMDturnPrecisions = np.append(_LMDturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _LMDturnRecalls = np.append(_LMDturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                _BERTp10s = np.append(_BERTp10s, np.nan)
                _BERTaps = np.append(_BERTaps, np.nan)
                _BERTndcg5s = np.append(_BERTndcg5s, np.nan)
                _BERTturnPrecisions = np.append(_BERTturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _BERTturnRecalls = np.append(_BERTturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                continue

            utterance = turn['raw_utterance']
            if turnID == 1:
                firstUtterance = utterance
            else:
                utterance += " " + firstUtterance
            print("-- -- {}: {}".format(turnID, utterance))

            # LMD
            [p10, recall, ap, ndcg5, precisions, recalls] = getMetricsNormal(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, utterance, utterance, REL_DOCS_PER_TURN, topicTurnID, relevanceJudgments)
            _LMDp10s = np.append(_LMDp10s, p10)
            _LMDaps = np.append(_LMDaps, ap)
            _LMDndcg5s = np.append(_LMDndcg5s, ndcg5)
            _LMDturnPrecisions = np.append(_LMDturnPrecisions, precisions)
            _LMDturnRecalls = np.append(_LMDturnRecalls, recalls)
            _LMDnturns += 1

            # BERT reranking on top of LMD
            [p10, recall, ap, ndcg5, precisions, recalls] = getMetricsNormal(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, utterance, utterance, REL_DOCS_PER_TURN, topicTurnID, relevanceJudgments, True)
            _BERTp10s = np.append(_BERTp10s, p10)
            _BERTaps = np.append(_BERTaps, ap)
            _BERTndcg5s = np.append(_BERTndcg5s, ndcg5)
            _BERTturnPrecisions = np.append(_BERTturnPrecisions, precisions)
            _BERTturnRecalls = np.append(_BERTturnRecalls, recalls)
            _BERTnturns += 1

        # pad the conversation with NaNs up to TURNS_PER_CONV turns
        while _LMDntotalTurns % TURNS_PER_CONV != 0:
            _LMDntotalTurns += 1
            _LMDp10s = np.append(_LMDp10s, np.nan)
            _LMDaps = np.append(_LMDaps, np.nan)
            _LMDndcg5s = np.append(_LMDndcg5s, np.nan)
            _LMDturnPrecisions = np.append(_LMDturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _LMDturnRecalls = np.append(_LMDturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
            _BERTntotalTurns += 1
            _BERTp10s = np.append(_BERTp10s, np.nan)
            _BERTaps = np.append(_BERTaps, np.nan)
            _BERTndcg5s = np.append(_BERTndcg5s, np.nan)
            _BERTturnPrecisions = np.append(_BERTturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _BERTturnRecalls = np.append(_BERTturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)

        # compute conversation means
        _LMDturnPrecisions = np.reshape(_LMDturnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _LMDturnRecalls = np.reshape(_LMDturnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _BERTturnPrecisions = np.reshape(_BERTturnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _BERTturnRecalls = np.reshape(_BERTturnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))

        # add precisions and recalls to the global matrices
        _LMDprecisions = np.append(_LMDprecisions, np.nanmean(_LMDturnPrecisions, axis=0))
        _LMDrecalls = np.append(_LMDrecalls, np.nanmean(_LMDturnRecalls, axis=0))
        _BERTprecisions = np.append(_BERTprecisions, np.nanmean(_BERTturnPrecisions, axis=0))
        _BERTrecalls = np.append(_BERTrecalls, np.nanmean(_BERTturnRecalls, axis=0))

        # counters
        _LMDntopics += 1
        _BERTntopics += 1

    # reshape
    _LMDp10s = np.reshape(_LMDp10s, (_LMDntopics, TURNS_PER_CONV))
    _LMDaps = np.reshape(_LMDaps, (_LMDntopics, TURNS_PER_CONV))
    _LMDndcg5s = np.reshape(_LMDndcg5s, (_LMDntopics, TURNS_PER_CONV))
    _LMDprecisions = np.reshape(_LMDprecisions, (_LMDntopics, MEASURES_PER_TURN))
    _LMDrecalls = np.reshape(_LMDrecalls, (_LMDntopics, MEASURES_PER_TURN))
    _BERTp10s = np.reshape(_BERTp10s, (_BERTntopics, TURNS_PER_CONV))
    _BERTaps = np.reshape(_BERTaps, (_BERTntopics, TURNS_PER_CONV))
    _BERTndcg5s = np.reshape(_BERTndcg5s, (_BERTntopics, TURNS_PER_CONV))
    _BERTprecisions = np.reshape(_BERTprecisions, (_BERTntopics, MEASURES_PER_TURN))
    _BERTrecalls = np.reshape(_BERTrecalls, (_BERTntopics, MEASURES_PER_TURN))

    method1Metrics = {
        "aps": {"lmd": _LMDaps, "bert": _BERTaps},
        "ndcg5s": {"lmd": _LMDndcg5s, "bert": _BERTndcg5s},
        "recalls": {"lmd": _LMDrecalls, "bert": _BERTrecalls},
        "precisions": {"lmd": _LMDprecisions, "bert": _BERTprecisions},
    }
    return method1Metrics


def project3Method2(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device,
                    nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName,
                    convNumbers, convNames):
    ## METHOD 2: boost the entities extracted from the conversation's first utterance
    print("Method 2")

    # counters
    _LMDntopics = 0
    _LMDnturns = 0
    _LMDntotalTurns = 0
    _BERTntopics = 0
    _BERTnturns = 0
    _BERTntotalTurns = 0

    # metrics
    _LMDp10s = np.array([])
    _LMDaps = np.array([])
    _LMDndcg5s = np.array([])
    _LMDprecisions = np.array([])
    _LMDrecalls = np.array([])
    _BERTp10s = np.array([])
    _BERTaps = np.array([])
    _BERTndcg5s = np.array([])
    _BERTprecisions = np.array([])
    _BERTrecalls = np.array([])

    for topic in topics:
        convID = topic['number']
        if convID not in topicsIDs:
            continue

        _LMDturnPrecisions = np.array([])
        _LMDturnRecalls = np.array([])
        _BERTturnPrecisions = np.array([])
        _BERTturnRecalls = np.array([])

        print("-- {}".format(convID))
        firstEntities = []
        for turn in topic['turn'][:TURNS_PER_CONV]:
            turnID = turn['number']
            topicTurnID = '%d_%d' % (convID, turnID)

            info = relevanceJudgments.loc[relevanceJudgments['topic_turn_id'] == topicTurnID]
            numberRel = info.loc[info['rel'] != 0]['docid'].count()

            _LMDntotalTurns += 1
            if numberRel == 0:
                _LMDp10s = np.append(_LMDp10s, np.nan)
                _LMDaps = np.append(_LMDaps, np.nan)
                _LMDndcg5s = np.append(_LMDndcg5s, np.nan)
                _LMDturnPrecisions = np.append(_LMDturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _LMDturnRecalls = np.append(_LMDturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                _BERTp10s = np.append(_BERTp10s, np.nan)
                _BERTaps = np.append(_BERTaps, np.nan)
                _BERTndcg5s = np.append(_BERTndcg5s, np.nan)
                _BERTturnPrecisions = np.append(_BERTturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _BERTturnRecalls = np.append(_BERTturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                continue

            utterance = turn['raw_utterance']
            if turnID == 1:
                # legacy string-concatenation version, kept for reference (disabled):
                # for entity in nlp(utterance).ents:
                #     firstEntities += entity.text + " "
                # utterance += " " + firstEntities
                firstEntities = [str(ent) for ent in nlp(utterance).ents]
            print("-- -- {}: {} ; {}".format(turnID, utterance, firstEntities))

            # LMD
            [p10, recall, ap, ndcg5, precisions, recalls] = getMetricsEntities(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, utterance, utterance, firstEntities, REL_DOCS_PER_TURN, topicTurnID, relevanceJudgments)
            _LMDp10s = np.append(_LMDp10s, p10)
            _LMDaps = np.append(_LMDaps, ap)
            _LMDndcg5s = np.append(_LMDndcg5s, ndcg5)
            _LMDturnPrecisions = np.append(_LMDturnPrecisions, precisions)
            _LMDturnRecalls = np.append(_LMDturnRecalls, recalls)
            _LMDnturns += 1

            # BERT reranking on top of LMD
            [p10, recall, ap, ndcg5, precisions, recalls] = getMetricsEntities(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, utterance, utterance, firstEntities, REL_DOCS_PER_TURN, topicTurnID, relevanceJudgments, True)
            _BERTp10s = np.append(_BERTp10s, p10)
            _BERTaps = np.append(_BERTaps, ap)
            _BERTndcg5s = np.append(_BERTndcg5s, ndcg5)
            _BERTturnPrecisions = np.append(_BERTturnPrecisions, precisions)
            _BERTturnRecalls = np.append(_BERTturnRecalls, recalls)
            _BERTnturns += 1

        # pad the conversation with NaNs up to TURNS_PER_CONV turns
        while _LMDntotalTurns % TURNS_PER_CONV != 0:
            _LMDntotalTurns += 1
            _LMDp10s = np.append(_LMDp10s, np.nan)
            _LMDaps = np.append(_LMDaps, np.nan)
            _LMDndcg5s = np.append(_LMDndcg5s, np.nan)
            _LMDturnPrecisions = np.append(_LMDturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _LMDturnRecalls = np.append(_LMDturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
            _BERTntotalTurns += 1
            _BERTp10s = np.append(_BERTp10s, np.nan)
            _BERTaps = np.append(_BERTaps, np.nan)
            _BERTndcg5s = np.append(_BERTndcg5s, np.nan)
            _BERTturnPrecisions = np.append(_BERTturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _BERTturnRecalls = np.append(_BERTturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)

        # compute conversation means
        _LMDturnPrecisions = np.reshape(_LMDturnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _LMDturnRecalls = np.reshape(_LMDturnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _BERTturnPrecisions = np.reshape(_BERTturnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _BERTturnRecalls = np.reshape(_BERTturnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))

        # add precisions and recalls to the global matrices
        _LMDprecisions = np.append(_LMDprecisions, np.nanmean(_LMDturnPrecisions, axis=0))
        _LMDrecalls = np.append(_LMDrecalls, np.nanmean(_LMDturnRecalls, axis=0))
        _BERTprecisions = np.append(_BERTprecisions, np.nanmean(_BERTturnPrecisions, axis=0))
        _BERTrecalls = np.append(_BERTrecalls, np.nanmean(_BERTturnRecalls, axis=0))

        # counters
        _LMDntopics += 1
        _BERTntopics += 1

    # reshape
    _LMDp10s = np.reshape(_LMDp10s, (_LMDntopics, TURNS_PER_CONV))
    _LMDaps = np.reshape(_LMDaps, (_LMDntopics, TURNS_PER_CONV))
    _LMDndcg5s = np.reshape(_LMDndcg5s, (_LMDntopics, TURNS_PER_CONV))
    _LMDprecisions = np.reshape(_LMDprecisions, (_LMDntopics, MEASURES_PER_TURN))
    _LMDrecalls = np.reshape(_LMDrecalls, (_LMDntopics, MEASURES_PER_TURN))
    _BERTp10s = np.reshape(_BERTp10s, (_BERTntopics, TURNS_PER_CONV))
    _BERTaps = np.reshape(_BERTaps, (_BERTntopics, TURNS_PER_CONV))
    _BERTndcg5s = np.reshape(_BERTndcg5s, (_BERTntopics, TURNS_PER_CONV))
    _BERTprecisions = np.reshape(_BERTprecisions, (_BERTntopics, MEASURES_PER_TURN))
    _BERTrecalls = np.reshape(_BERTrecalls, (_BERTntopics, MEASURES_PER_TURN))

    method2Metrics = {
        "aps": {"lmd": _LMDaps, "bert": _BERTaps},
        "ndcg5s": {"lmd": _LMDndcg5s, "bert": _BERTndcg5s},
        "recalls": {"lmd": _LMDrecalls, "bert": _BERTrecalls},
        "precisions": {"lmd": _LMDprecisions, "bert": _BERTprecisions},
    }
    return method2Metrics
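

# Hedged sketch, not project code: shows the entity extraction Method 2 relies
# on, using a stock spaCy pipeline ('en_core_web_sm' is an assumption about
# which model the `nlp` object is).
def _demo_first_turn_entities():
    import spacy
    nlp_demo = spacy.load('en_core_web_sm')
    doc = nlp_demo("Tell me about the Bronze Age collapse.")
    return [str(ent) for ent in doc.ents]  # e.g. ['the Bronze Age']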


def project3Method3(REL_DOCS_PER_TURN, UPDATE_ELASTICSEARCH_RESULTS, rewriter, tokenizer, model, device,
                    nlp, classifier, elastic, testBed, topics, relevanceJudgments, topicsIDs, setName,
                    convNumbers, convNames):
    ## METHOD 3: rewrite each utterance into a self-contained query with T5
    print("Method 3")

    # counters
    _LMDntopics = 0
    _LMDnturns = 0
    _LMDntotalTurns = 0
    _BERTntopics = 0
    _BERTnturns = 0
    _BERTntotalTurns = 0

    # metrics
    _LMDp10s = np.array([])
    _LMDaps = np.array([])
    _LMDndcg5s = np.array([])
    _LMDprecisions = np.array([])
    _LMDrecalls = np.array([])
    _BERTp10s = np.array([])
    _BERTaps = np.array([])
    _BERTndcg5s = np.array([])
    _BERTprecisions = np.array([])
    _BERTrecalls = np.array([])

    for topic in topics:
        convID = topic['number']
        if convID not in topicsIDs:
            continue

        _LMDturnPrecisions = np.array([])
        _LMDturnRecalls = np.array([])
        _BERTturnPrecisions = np.array([])
        _BERTturnRecalls = np.array([])

        print("-- {}".format(convID))
        convUtterances = []
        for turn in topic['turn'][:TURNS_PER_CONV]:
            turnID = turn['number']
            topicTurnID = '%d_%d' % (convID, turnID)

            info = relevanceJudgments.loc[relevanceJudgments['topic_turn_id'] == topicTurnID]
            numberRel = info.loc[info['rel'] != 0]['docid'].count()

            _LMDntotalTurns += 1
            if numberRel == 0:
                _LMDp10s = np.append(_LMDp10s, np.nan)
                _LMDaps = np.append(_LMDaps, np.nan)
                _LMDndcg5s = np.append(_LMDndcg5s, np.nan)
                _LMDturnPrecisions = np.append(_LMDturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _LMDturnRecalls = np.append(_LMDturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                _BERTp10s = np.append(_BERTp10s, np.nan)
                _BERTaps = np.append(_BERTaps, np.nan)
                _BERTndcg5s = np.append(_BERTndcg5s, np.nan)
                _BERTturnPrecisions = np.append(_BERTturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
                _BERTturnRecalls = np.append(_BERTturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
                continue

            # rewrite the raw utterance given the conversation history so far
            # (the redundant convUtterances[:len(convUtterances)] slice was
            # simplified to convUtterances)
            utterance = rewriter.rewrite_query_with_T5(turn['raw_utterance'], convUtterances)
            convUtterances.append(turn['raw_utterance'])
            print("-- -- {}: {}".format(turnID, utterance))

            # LMD
            [p10, recall, ap, ndcg5, precisions, recalls] = getMetricsNormal(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, utterance, utterance, REL_DOCS_PER_TURN, topicTurnID, relevanceJudgments)
            _LMDp10s = np.append(_LMDp10s, p10)
            _LMDaps = np.append(_LMDaps, ap)
            _LMDndcg5s = np.append(_LMDndcg5s, ndcg5)
            _LMDturnPrecisions = np.append(_LMDturnPrecisions, precisions)
            _LMDturnRecalls = np.append(_LMDturnRecalls, recalls)
            _LMDnturns += 1

            # BERT reranking on top of LMD
            [p10, recall, ap, ndcg5, precisions, recalls] = getMetricsNormal(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, utterance, utterance, REL_DOCS_PER_TURN, topicTurnID, relevanceJudgments, True)
            _BERTp10s = np.append(_BERTp10s, p10)
            _BERTaps = np.append(_BERTaps, ap)
            _BERTndcg5s = np.append(_BERTndcg5s, ndcg5)
            _BERTturnPrecisions = np.append(_BERTturnPrecisions, precisions)
            _BERTturnRecalls = np.append(_BERTturnRecalls, recalls)
            _BERTnturns += 1

        # pad the conversation with NaNs up to TURNS_PER_CONV turns
        while _LMDntotalTurns % TURNS_PER_CONV != 0:
            _LMDntotalTurns += 1
            _LMDp10s = np.append(_LMDp10s, np.nan)
            _LMDaps = np.append(_LMDaps, np.nan)
            _LMDndcg5s = np.append(_LMDndcg5s, np.nan)
            _LMDturnPrecisions = np.append(_LMDturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _LMDturnRecalls = np.append(_LMDturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)
            _BERTntotalTurns += 1
            _BERTp10s = np.append(_BERTp10s, np.nan)
            _BERTaps = np.append(_BERTaps, np.nan)
            _BERTndcg5s = np.append(_BERTndcg5s, np.nan)
            _BERTturnPrecisions = np.append(_BERTturnPrecisions, np.ones(MEASURES_PER_TURN) * np.nan)
            _BERTturnRecalls = np.append(_BERTturnRecalls, np.ones(MEASURES_PER_TURN) * np.nan)

        # compute conversation means
        _LMDturnPrecisions = np.reshape(_LMDturnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _LMDturnRecalls = np.reshape(_LMDturnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _BERTturnPrecisions = np.reshape(_BERTturnPrecisions, (TURNS_PER_CONV, MEASURES_PER_TURN))
        _BERTturnRecalls = np.reshape(_BERTturnRecalls, (TURNS_PER_CONV, MEASURES_PER_TURN))

        # add precisions and recalls to the global matrices
        _LMDprecisions = np.append(_LMDprecisions, np.nanmean(_LMDturnPrecisions, axis=0))
        _LMDrecalls = np.append(_LMDrecalls, np.nanmean(_LMDturnRecalls, axis=0))
        _BERTprecisions = np.append(_BERTprecisions, np.nanmean(_BERTturnPrecisions, axis=0))
        _BERTrecalls = np.append(_BERTrecalls, np.nanmean(_BERTturnRecalls, axis=0))

        # counters
        _LMDntopics += 1
        _BERTntopics += 1

    # reshape
    _LMDp10s = np.reshape(_LMDp10s, (_LMDntopics, TURNS_PER_CONV))
    _LMDaps = np.reshape(_LMDaps, (_LMDntopics, TURNS_PER_CONV))
    _LMDndcg5s = np.reshape(_LMDndcg5s, (_LMDntopics, TURNS_PER_CONV))
    _LMDprecisions = np.reshape(_LMDprecisions, (_LMDntopics, MEASURES_PER_TURN))
    _LMDrecalls = np.reshape(_LMDrecalls, (_LMDntopics, MEASURES_PER_TURN))
    _BERTp10s = np.reshape(_BERTp10s, (_BERTntopics, TURNS_PER_CONV))
    _BERTaps = np.reshape(_BERTaps, (_BERTntopics, TURNS_PER_CONV))
    _BERTndcg5s = np.reshape(_BERTndcg5s, (_BERTntopics, TURNS_PER_CONV))
    _BERTprecisions = np.reshape(_BERTprecisions, (_BERTntopics, MEASURES_PER_TURN))
    _BERTrecalls = np.reshape(_BERTrecalls, (_BERTntopics, MEASURES_PER_TURN))

    method3Metrics = {
        "aps": {"lmd": _LMDaps, "bert": _BERTaps},
        "ndcg5s": {"lmd": _LMDndcg5s, "bert": _BERTndcg5s},
        "recalls": {"lmd": _LMDrecalls, "bert": _BERTrecalls},
        "precisions": {"lmd": _LMDprecisions, "bert": _BERTprecisions},
    }
    return method3Metrics
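

# Hedged sketch, not project code: `rewriter.rewrite_query_with_T5` is
# implemented elsewhere. This shows one plausible approach using a public
# conversational query-rewriting checkpoint; the model name
# 'castorini/t5-base-canard' and the ' ||| ' history separator are assumptions
# about that checkpoint's input format, not the project's actual rewriter.
def _demo_t5_rewrite(utterance, history):
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    tok = T5Tokenizer.from_pretrained('castorini/t5-base-canard')
    t5 = T5ForConditionalGeneration.from_pretrained('castorini/t5-base-canard')
    src = " ||| ".join(history + [utterance])
    ids = tok(src, return_tensors='pt').input_ids
    out = t5.generate(ids, max_length=64, num_beams=4)
    return tok.decode(out[0], skip_special_tokens=True)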


def getMetrics(testBed, result, topicTurnID):
    if np.size(result) == 0:
        return [0, 0, 0, 0, np.zeros(MEASURES_PER_TURN), np.zeros(MEASURES_PER_TURN)]
    [p10, recall, ap, ndcg5, precisions, recalls] = testBed.eval(result[['_id', '_score']], topicTurnID)
    return [p10, recall, ap, ndcg5, precisions, recalls]


def getMetricsNormal(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed,
                     question, utterance, numDocs, topicTurnID, relevanceJudgments, runBERT=False):
    result = elastic.search_body(query=utterance, numDocs=numDocs)
    if runBERT:
        result = getBERTResult(classifier, tokenizer, model, device, question, result, topicTurnID, relevanceJudgments)
    return getMetrics(testBed, result, topicTurnID)


def getMetricsEntities(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed,
                       question, utterance, entities, numDocs, topicTurnID, relevanceJudgments, runBERT=False):
    # without entities, fall back to the plain search
    if len(entities) == 0:
        return getMetricsNormal(MEASURES_PER_TURN, classifier, tokenizer, model, device, elastic, testBed, question, utterance, numDocs, topicTurnID, relevanceJudgments, runBERT)
    # boost every extracted entity with equal weight 1.0
    result = elastic.search_with_boosted_entities(utterance, entities, np.ones(len(entities)), numDocs)
    if runBERT:
        result = getBERTResult(classifier, tokenizer, model, device, question, result, topicTurnID, relevanceJudgments)
    return getMetrics(testBed, result, topicTurnID)
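

# Hedged sketch, not project code: `elastic.search_with_boosted_entities` is
# implemented elsewhere. This shows one plausible Elasticsearch query body for
# it, adding a boosted match clause per entity; the field name 'body' is an
# assumption about the index mapping.
def _demo_boosted_entity_query(utterance, entities, boosts):
    should = [{"match": {"body": {"query": e, "boost": float(b)}}}
              for e, b in zip(entities, boosts)]
    return {"query": {"bool": {"must": [{"match": {"body": utterance}}],
                               "should": should}}}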


def getBERTResult(classifier, tokenizer, model, device, question, pickleFile, topicTurnID, relevanceJudgments):
    # NOTE: topicTurnID and relevanceJudgments are kept in the signature for
    # call-site compatibility; the per-document relevance lookup that used
    # them here was dead code and has been removed
    passages = []
    for docID in pickleFile['_id']:
        passages.append(pickleFile[pickleFile['_id'] == docID]['_source.body'].values[0])

    # embed each (question, passage) pair and keep the [CLS] vector
    features = np.array([])
    for passage in passages:
        bertInput = convert_to_bert_input(sentences=[question, passage], max_seq_length=512, tokenizer=tokenizer, padding='max_length', truncation=False)
        bertInput['input_ids'] = bertInput['input_ids'].to(device)
        bertInput['attention_mask'] = bertInput['attention_mask'].to(device)
        bertInput['token_type_ids'] = bertInput['token_type_ids'].to(device)
        bertOutput = model(**bertInput)
        featuresLine = bertOutput["last_hidden_state"][0, 0].detach().clone().cpu().numpy()
        features = np.append(features, featuresLine)
    features = np.reshape(features, (len(passages), 768))

    # rerank by descending relevance probability
    prob = classifier.predict_proba(features)[:, 1]
    newOrder = np.lexsort(np.reshape(prob, (1, prob.shape[0])))[::-1]
    result = np.column_stack((pickleFile['_id'], prob))
    result = result[newOrder, :]
    result = pd.DataFrame(data=result, columns=['_id', '_score'])

    gc.collect()
    return result


def doPlots(REL_DOCS_PER_TURN, SET_NAME, preName, APs, nDCGs, Recalls, Precisions, methods, convNumbers, convNames):
    plots.plotMetricAlongConversation(PROJ_DIR, REL_DOCS_PER_TURN, SET_NAME, "Average Precision", APs, methods, convNumbers, preName + " - ")
    plots.plotMetricAlongConversation(PROJ_DIR, REL_DOCS_PER_TURN, SET_NAME, "normalized Discounted Cumulative Gain", nDCGs, methods, convNumbers, preName + " - ")
    plots.plotMetricEachConversation(PROJ_DIR, REL_DOCS_PER_TURN, SET_NAME, preName + " - Average Precision", APs, methods, convNumbers, convNames)
    plots.plotMetricEachConversation(PROJ_DIR, REL_DOCS_PER_TURN, SET_NAME, preName + " - normalized Discounted Cumulative Gain", nDCGs, methods, convNumbers, convNames)
    plots.plotPrecisionRecall(PROJ_DIR, REL_DOCS_PER_TURN, SET_NAME, Recalls, Precisions, methods, convNumbers, preName + " - ")


# ---------------------------------------------------------------------------
# Legacy scratch code kept for reference (disabled):
#
# question = turn['raw_utterance']
# for docID in info['docid']:
#     docInfo = info.loc[info['docid'] == docID]
#     passageList = pickleFile[pickleFile['_id'] == docID]['_source.body'].values
#     if len(passageList) > 0:
#         passage = passageList[0]
#         rel = (docInfo['rel'].values[0] > 0) * 1
#         triplets = np.append(triplets, np.array([str(question), str(passage), rel]))
#         totalTurns += 1
# triplets = np.reshape(triplets, (totalTurns, 3))
# print("Getting BERT embeddings")
# for triplet in triplets:
#     bertInput = convert_to_bert_input(sentences=triplet[:-1], max_seq_length=512,
#                                       tokenizer=tokenizer, padding='max_length', truncation=False)
#     bertInput['input_ids'] = bertInput['input_ids'].to(device)
#     bertInput['attention_mask'] = bertInput['attention_mask'].to(device)
#     bertInput['token_type_ids'] = bertInput['token_type_ids'].to(device)
#     bertOutput = model(**bertInput)
#     featuresLine = bertOutput["last_hidden_state"][0, 0].detach().clone().cpu().numpy()
#     features = np.append(features, featuresLine)
# features = np.reshape(features, (totalTurns, 768))
# print("Training classifier")
# classifier.fit(features, triplets[:, 2])
# print("Training done")
#
# for docID in info['docid']:
#     if docID[:3] != "CAR":
#         print(docID)
# print(elastic.get_doc_body('MARCO_955948'))
# print(info['docid'])
# print()
#
# passages = pickleFile['_source']['body']
# relevances = info['rel']
# print("passages:\n{}\nrel\n{}\n\n".format(topicTurnID, passage, relevance))
# ---------------------------------------------------------------------------


def convert_to_bert_input(sentences=None, max_seq_length=512, tokenizer=None, add_cls=True,
                          padding='do_not_pad', truncation=False):
    """Receives a single [query_text, doc_text] pair in `sentences` and returns
    a dictionary of tensors. It can only handle one pair per call.
    """
    # fix: encode the pair as (text, text_pair) instead of passing the list as
    # a single argument, so token_type_ids correctly separate the query segment
    # from the passage segment, matching the docstring's intent
    return tokenizer.encode_plus(sentences[0], sentences[1],
                                 add_special_tokens=add_cls,
                                 padding=padding,
                                 max_length=max_seq_length,
                                 truncation=truncation,
                                 return_tensors='pt',
                                 return_token_type_ids=True,
                                 return_attention_mask=True)
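

# Hedged usage sketch, not part of the pipeline: runs convert_to_bert_input on
# one query/passage pair with a public checkpoint ('bert-base-uncased' is an
# illustrative assumption, not necessarily the project's model).
def _demo_convert_to_bert_input():
    from transformers import BertTokenizer
    tok = BertTokenizer.from_pretrained('bert-base-uncased')
    enc = convert_to_bert_input(sentences=["what is throat cancer?",
                                           "Throat cancer is a type of cancer of the pharynx or larynx."],
                                max_seq_length=512, tokenizer=tok,
                                padding='max_length', truncation=True)
    # enc['input_ids'], enc['attention_mask'] and enc['token_type_ids'] are
    # (1, 512) tensors, ready to be passed to model(**enc)
    return enc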