-
Notifications
You must be signed in to change notification settings - Fork 5
/
model_evaluation.py
102 lines (79 loc) · 3.41 KB
/
model_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
from typing import Callable
import sys
# Name of an optional column: when it is present, rows whose value in it
# equals 1 are dropped by evaluate_performance before counting.
DISABLE_TEST_COLUMN = 'disable'
# Directory added to the module search path before the confusion_matrix import.
# NOTE(review): this is an absolute path on one developer's machine; presumably
# it is where confusion_matrix.py lives - confirm and adjust per environment.
ANALYSIS_PATH = '/Users/idan/src/analysis_utils'
# Must run before the `from confusion_matrix import ...` line that follows.
sys.path.append(ANALYSIS_PATH)
from confusion_matrix import ConfusionMatrix
def classifiy_commits_df(df,
                         classification_column,
                         classification_function,
                         text_name: str = 'message'):
    """Add a boolean prediction column to ``df`` and return the same frame.

    Every value of ``df[text_name]`` is passed to ``classification_function``;
    the stored prediction is whether that result is greater than zero.

    Args:
        df: Frame holding the texts. It is modified in place.
        classification_column: Name of the column that receives the
            predictions (created or overwritten).
        classification_function: Callable applied to one text value at a
            time; its result must be comparable with ``0``.
        text_name: Name of the column holding the texts.

    Returns:
        The very ``df`` object that was passed in.
    """
    def _is_hit(text):
        # A strictly positive result counts as a positive classification.
        return classification_function(text) > 0

    predictions = df[text_name].map(_is_hit)
    df[classification_column] = predictions
    return df
def evaluate_performance(df,
                         classification_column,
                         concept_column,
                         text_name: str = 'message'):
    """Summarize how the predictions in ``df`` agree with the labels.

    Rows are grouped by the (prediction, label) pair and the non-null values
    of ``text_name`` are counted per group. The grouped counts are handed to
    ``ConfusionMatrix`` and the result of its ``summarize()`` is returned.

    Args:
        df: Frame containing the prediction, label and text columns. It is
            not modified; filtering works on a local reference.
        classification_column: Name of the prediction column.
        concept_column: Name of the label column.
        text_name: Name of the column that is counted within each group.

    Returns:
        Whatever ``ConfusionMatrix.summarize()`` returns.
    """
    if DISABLE_TEST_COLUMN in df.columns:
        # Keep only the rows that are not flagged as disabled (value 1).
        still_enabled = df[DISABLE_TEST_COLUMN] != 1
        df = df[still_enabled]

    group_keys = [classification_column, concept_column]
    pair_counts = df.groupby(group_keys, as_index=False).agg({text_name: 'count'})

    matrix = ConfusionMatrix(
        g_df=pair_counts,
        classifier=classification_column,
        concept=concept_column,
        count=text_name,
    )
    return matrix.summarize()
def evaluate_regex_results(labels_file,
                           classification_column,
                           classification_function,
                           concept_column,
                           text_name: str = 'message'):
    """Classify the texts in a labels CSV, save the predictions, and evaluate.

    The file is read, a prediction column is computed from the text column,
    the frame (including the new column) is written back to the same path,
    and the predictions are then compared with the label column.

    Args:
        labels_file: Path of the CSV file. It is overwritten with the
            classified frame.
        classification_column: Name of the prediction column to create or
            overwrite.
        classification_function: Callable applied to each text; a result
            greater than zero counts as a positive prediction.
        concept_column: Name of the label column in the file.
        text_name: Name of the text column. Defaults to ``'message'``, which
            was previously the only supported column name.

    Returns:
        The summary returned by ``evaluate_performance``.
    """
    df = pd.read_csv(labels_file, engine='python')
    df = classifiy_commits_df(df,
                              classification_column,
                              classification_function,
                              text_name=text_name)
    # Persist the predictions before evaluating, so the file is updated even
    # if the evaluation step fails.
    df.to_csv(labels_file, index=False)
    return evaluate_performance(df,
                                classification_column,
                                concept_column,
                                text_name=text_name)
def evaluate_regex_results_on_df(df: pd.DataFrame,
                                 classification_column: str,
                                 classification_function: Callable,
                                 concept_column: str,
                                 text_name: str = 'message'):
    """Classify the texts of an in-memory frame and evaluate the predictions.

    Args:
        df: Frame with the text and label columns. The prediction column is
            added to it by ``classifiy_commits_df``.
        classification_column: Name of the prediction column to create or
            overwrite.
        classification_function: Callable applied to each text value.
        concept_column: Name of the label column.
        text_name: Name of the text column.

    Returns:
        The summary returned by ``evaluate_performance``.
    """
    classified = classifiy_commits_df(
        df=df,
        classification_column=classification_column,
        classification_function=classification_function,
        text_name=text_name,
    )
    summary = evaluate_performance(
        df=classified,
        classification_column=classification_column,
        concept_column=concept_column,
        text_name=text_name,
    )
    return summary
def evaluate_concept_classifier(concept,
                                text_name,
                                classification_function,
                                samples_file,
                                classification_column: str = None,
                                concept_column: str = None):
    """Evaluate a classifier for ``concept`` on a labelled samples CSV.

    Reads ``samples_file``, keeps only the rows that have a label, classifies
    their texts, prints the resulting summary and returns it.

    Args:
        concept: Concept name; used to derive default column names and in the
            printed header.
        text_name: Name of the text column in the samples file.
        classification_function: Callable applied to each text; a result
            greater than zero counts as a positive prediction.
        samples_file: Path of the CSV file with the labelled samples.
        classification_column: Name of the prediction column. When empty or
            ``None`` it defaults to ``concept + '_pred'``.
        concept_column: Name of the label column. When empty or ``None`` it
            defaults to ``'Is_' + concept``.

    Returns:
        The summary returned by ``evaluate_regex_results_on_df``.
    """
    if not classification_column:
        classification_column = concept + '_pred'
    if not concept_column:
        concept_column = 'Is_' + concept

    samples = pd.read_csv(samples_file)
    # Keep only labelled rows. The explicit copy matters: the classification
    # step assigns a new column, and doing that on a boolean-mask selection
    # of another frame raises pandas' SettingWithCopyWarning.
    labelled = samples[samples[concept_column].notna()].copy()

    cm = evaluate_regex_results_on_df(df=labelled,
                                      classification_column=classification_column,
                                      classification_function=classification_function,
                                      concept_column=concept_column,
                                      text_name=text_name)
    print(concept + " CM")
    print(cm)
    return cm