-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontext_precision.py
More file actions
109 lines (88 loc) · 4.19 KB
/
context_precision.py
File metadata and controls
109 lines (88 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from polyeval.utils.model import ModelCaller
from typing import Any, Callable, Dict
from datasets import Dataset
import logging
import json
logging.basicConfig(level=logging.INFO)
class ContextPrecision:
    """Metric scoring how well retrieved contexts rank useful passages first.

    For each context an LLM judge answers Yes/No on whether the context helps
    answer the question; the final score is the mean of precision@k over the
    positions k that were judged useful (i.e. average precision).
    """

    name = 'context_precision'

    # Judge-prompt and human-readable reasoning templates, keyed by language.
    # NOTE: template text is runtime data sent to / shown by the model — do not edit.
    PROMPTS = {
        'en': {
            'CONTEXT_PRECISION_TEMPLATE': """\
Given a question and a context, verify if the information in the given context is useful in answering the question. Return a Yes/No answer.
question: {question}
context:
{context}
answer:
""",
            'REASONING_MESSAGE': """
My reasoning process is as follows:
I will go through each context, verify whether the information in the given context helps answer the question, and return a "Yes" or "No" answer, which I will convert to True or False accordingly.
Here are my judgments for each context:
{judges}
Then, by calculating the cumulative precision of all the correct responses (True) up to that point, the average precision score is: {result}
"""
        },
        'zh': {
            'CONTEXT_PRECISION_TEMPLATE': """\
给定一个问题和一个上下文,验证给定的上下文中的信息是否有助于回答这个问题。返回一个“Yes”或“No”的答案。
问题:{question}
上下文:
{context}
答案:
""",
            'REASONING_MESSAGE': """
我的推理过程如下:
我会遍历每个上下文信息,验证给定的上下文中的信息是否有助于回答这个问题。返回一个“Yes”或“No”的答案,并将其转化为True或False的相应。
以下依次是我对每个上下文的判断:
{judges}
然后,通过计算排在前面的所有正确响应(True)的累计精度的平均值,得出的平均精度得分是:{result}
"""
        }
    }

    def __init__(self, lang: str = 'zh'):
        # Language used to select prompt templates; must be a key of PROMPTS.
        self.language = lang

    def get_prompt(self, key: str) -> str:
        """Return the template named *key* for the configured language.

        Raises:
            KeyError: if the language or key is unknown.
        """
        return self.PROMPTS[self.language][key]

    # Forward-ref annotation: keeps the class importable without `datasets`
    # (the module still imports Dataset at the top; file behavior is unchanged).
    def eval(self, dataset: "Dataset", **kwargs):
        """Compute context precision for one sample.

        Args:
            dataset: mapping with a "question" entry (str) and a "context"
                entry (list of context strings) — presumably a datasets row;
                TODO confirm against callers.
            **kwargs: must contain 'llm', the sampling parameters forwarded
                to ModelCaller.call.

        Returns:
            A dict with "score" (float), "reasoning" (str) and "responses"
            (raw model responses) on success; a ``(False, message)`` tuple
            on missing input (callers rely on this asymmetry — kept as-is).
        """
        if dataset is None:
            return False, "No dataset provided"
        sample_kwargs = kwargs.get('llm', None)
        if not sample_kwargs:
            return False, "No sampling parameters provided"
        question = dataset["question"]
        context = dataset["context"]
        determine_results, determine_responses = self.determine_usefulness(question, context, sample_kwargs)
        result_score = self.calculate_average_precision(determine_results)
        judges = ",".join(str(result) for result in determine_results)
        reasoning = self.get_prompt('REASONING_MESSAGE').format(judges=judges, result=result_score)
        return {"score": result_score, "reasoning": reasoning, "responses": determine_responses}

    def determine_usefulness(self, question: str, contexts: list[str], sample_kwargs: dict = None) -> tuple[list[bool], list]:
        """Ask the judge model, per context, whether it helps answer *question*.

        Returns:
            ``(results, responses)``: booleans (True when the model's parsed
            reply contains "Yes") and the raw responses, in context order.
            (Fixed: the original annotation claimed ``list[bool]`` but the
            method has always returned this 2-tuple.)
        """
        determine_responses = []
        determine_results = []
        for context in contexts:
            determine_prompt = self.get_prompt('CONTEXT_PRECISION_TEMPLATE').format(question=question, context=context)
            # str.format always yields str, so the old isinstance(...) guard was
            # dead code — and would have left `messages` unbound if it ever failed.
            messages = [{"role": "user", "content": determine_prompt}]
            determine_response, elapsed_time = ModelCaller.call(self, messages=messages, **sample_kwargs)
            determine_result = ModelCaller.parse_response(self, determine_response)
            # Attach bookkeeping to the raw response object for downstream reporting.
            determine_response.elapsed_time = elapsed_time
            determine_response.prompt = json.dumps(determine_prompt)
            determine_response.completion = determine_result
            # Case-sensitive on purpose: the prompt instructs a literal "Yes".
            determine_results.append("Yes" in determine_result)
            determine_responses.append(determine_response)
        return determine_results, determine_responses

    def calculate_average_precision(self, responses: list[bool]) -> float:
        """Average of precision@k over every position k judged useful.

        Returns 0.0 for an empty list or when nothing was judged useful.
        (Fixed: replaced the O(n^2) ``sum(responses[:i])`` rescan with a
        running hit counter — identical results in O(n).)
        """
        hits = 0
        precision_sum = 0.0
        for rank, is_useful in enumerate(responses, start=1):
            if is_useful:
                hits += 1
                precision_sum += hits / rank  # precision among the first `rank` contexts
        return precision_sum / hits if hits else 0.0