1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-07-04T17:20:42.362428Z",
"start_time": "2025-07-04T17:20:42.350473Z"
}
},
"source": [
"import json\n",
"\n",
"import numpy as np\n",
"from tabulate import tabulate\n",
"\n",
"with open(\"./../data/test_performance_output.json\", \"r\") as f:\n",
" data = json.load(f)\n",
"\n",
"table = []\n",
"\n",
"print(f\"Num test results {len(data)}\")\n",
"\n",
"for test in data:\n",
" # print(f\"Test: {test['input']}\")\n",
" result = test['result']\n",
" num_expected = len(test['expected'])\n",
" num_terms = len(result[\"terms\"])\n",
" num_definitions = len([definition for term in result[\"terms\"] for definition in term[\"definitions\"]])\n",
"\n",
" true_expected = test['trueExpected']\n",
" allowed_expected = test['allowedExpected']\n",
" allowed_unexpected = test['allowedUnexpected']\n",
" false_observed = test['falseObserved']\n",
" false_definitions = test['falseDefinitions']\n",
"\n",
" table.append([\n",
" 1 if num_expected == 0 else true_expected / num_expected,\n",
" 1 if num_expected == 0 else allowed_expected / num_expected,\n",
" 1 if num_terms == 0 else (num_terms - false_observed) / num_terms,\n",
" 1 if num_definitions == 0 else (num_definitions - false_definitions) / num_definitions\n",
" ])\n",
"\n",
"print(tabulate(table, headers=[\n",
" \"Recall (Strict)\",\n",
" \"Recall\",\n",
" \"Precision\",\n",
" \"Gen. Def. Validity\"\n",
"]))\n",
"\n",
"print(\"\\n#### Averages ####\\n\")\n",
"\n",
"print(tabulate([np.average(np.array(table), axis=0)], headers=[\n",
" \"Recall (Strict)\",\n",
" \"Recall\",\n",
" \"Precision\",\n",
" \"Gen. Def. Validity\"\n",
"]))\n",
"\n",
"# data"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num test results 21\n",
" Recall (Strict) Recall Precision Gen. Def. Validity\n",
"----------------- -------- ----------- --------------------\n",
" 0.666667 1 1 1\n",
" 1 1 0.8 0.25\n",
" 0.666667 1 0.75 1\n",
" 1 1 0.6 1\n",
" 1 1 0.8 0.333333\n",
" 0.714286 0.857143 1 0.5\n",
" 0.8 1 1 0.5\n",
" 1 1 0 1\n",
" 1 1 1 1\n",
" 1 1 0.6 1\n",
" 1 1 0.428571 1\n",
" 1 1 1 1\n",
" 1 1 1 1\n",
" 1 1 1 0\n",
" 0.8 1 1 1\n",
" 1 1 0.666667 1\n",
" 0.75 0.75 0.666667 1\n",
" 1 1 0.333333 1\n",
" 1 1 1 1\n",
" 1 1 1 1\n",
" 0.8 1 0.833333 1\n",
"\n",
"#### Averages ####\n",
"\n",
" Recall (Strict) Recall Precision Gen. Def. Validity\n",
"----------------- -------- ----------- --------------------\n",
" 0.914172 0.981293 0.784694 0.837302\n"
]
}
],
"execution_count": 63
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|