-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbase.py
More file actions
110 lines (98 loc) · 3.36 KB
/
base.py
File metadata and controls
110 lines (98 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from glob import glob
import pandas as pd
from pandasql import sqldf
# Root directory holding the per-corpus folders and the shared judgments file.
base_path = "./data"
# Prefix of the corpus folders; folders are matched as "<corpus>_*".
corpus = "MetaLing"
# File name of the judgments file that sits directly under base_path.
fpost = "judgments.csv"

# Use identifiers whose judgment pairs are left out of the selection query.
exclude = [
    "character-Ray-124-20250414",
    "character-Ray-123-20250414",
]

# Per-corpus input files, discovered once when the module is imported.
jfiles = glob(f"{base_path}/{corpus}_*/*_judgments.csv")
ufiles = glob(f"{base_path}/{corpus}_*/*_uses.csv")
def _read_tsv_files(paths):
    """Read every tab-separated file in *paths* and stack them into one frame.

    Returns ``None`` when *paths* is empty so the caller can decide how to
    handle a missing table.
    """
    frames = []
    for filename in paths:
        with open(filename) as csvdata:
            frames.append(pd.read_csv(csvdata, delimiter="\t"))
    # A single concat avoids re-copying the accumulated rows for every file.
    return pd.concat(frames) if frames else None


def get_df():
    """Return the use pairs whose existing judgments disagree.

    Loads all ``*_uses.csv`` files (``ufiles``) and ``*_judgments.csv`` files
    (``jfiles``), appends the judgments stored in ``{base_path}/{fpost}``, and
    runs a SQL query (via ``sqldf``) that keeps identifier pairs which

    * have at least two different non-zero judgments (``delta > 0``),
    * have fewer than 5 non-zero judgments,
    * were last judged more than 0.007 days (about 10 minutes) ago, and
    * do not involve an identifier listed in ``exclude``.

    Returns:
        A DataFrame with the columns ``context1``, ``pos1``, ``identifier1``,
        ``context2``, ``pos2``, ``identifier2`` and ``lemma``, where the
        context/position columns come from the use table rows matching each
        identifier.

    Raises:
        FileNotFoundError: if no use files were found, since the query cannot
            resolve the contexts without them.
    """
    usedf = _read_tsv_files(ufiles)
    if usedf is None:
        raise FileNotFoundError(
            f"no *_uses.csv files found under {base_path}/{corpus}_*"
        )

    # The original loop tested `usedf is None` here, which only worked because
    # pd.concat silently drops None entries; each table is now read on its own.
    judgedf = _read_tsv_files(jfiles)

    newdf = pd.read_csv(f"{base_path}/{fpost}", delimiter="\t")
    if judgedf is None:
        # No per-corpus judgments: fall back to the shared file alone.
        startdf = newdf.copy()
    elif len(newdf):
        startdf = pd.concat([judgedf, newdf])
    else:
        startdf = judgedf.copy()

    # Double any single quote so an identifier cannot terminate the SQL literal.
    excluded = "','".join(name.replace("'", "''") for name in exclude)

    # only disagreement
    #
    # An alternative selection ("first unannotated, then disagreement") that
    # was kept here as an unused string used the same aggregation without the
    # judgment/exclude/timestamp filters, and selected with
    #   WHERE (delta > 0 AND count < 5) OR count = 1
    #   ORDER BY count ASC, delta DESC
    #   LIMIT 1
    query = f"""
SELECT
u1.`context` AS context1, u1.indexes_target_token AS pos1, j.identifier1,
u2.`context` AS context2, u2.indexes_target_token AS pos2, j.identifier2,
j.lemma
--count, delta, timestamp,
--JULIANDAY() - JULIANDAY(timestamp) AS time
FROM (
SELECT
*
FROM (
SELECT
COUNT(*) AS count,
MIN(judgment) AS min,
MAX(judgment) AS max,
AVG(judgment) AS avg,
MAX(judgment)-MIN(judgment) AS delta,
MAX(timestamp) AS timestamp,
identifier1, identifier2,
GROUP_CONCAT(DISTINCT lemma) AS lemma
FROM startdf
WHERE judgment != 0
AND identifier1 NOT IN ('{excluded}')
AND identifier2 NOT IN ('{excluded}')
GROUP BY identifier1, identifier2
ORDER BY delta DESC, count DESC
) AS aggdf
WHERE delta > 0 AND count < 5
AND JULIANDAY() - JULIANDAY(timestamp) > 0.007 -- 10 minutes in days
-- WHERE (delta > 0 AND count < 5) OR count = 1
ORDER BY delta DESC
-- LIMIT 1
) AS j
JOIN usedf AS u1 ON u1.`identifier`=j.identifier1
JOIN usedf AS u2 ON u2.`identifier`=j.identifier2
"""
    # Expose only the two tables the query names instead of every local.
    testdf = sqldf(query, {"startdf": startdf, "usedf": usedf})
    return testdf