This repository was archived by the owner on Nov 22, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathselectLowestShingleDist.py
87 lines (65 loc) · 1.71 KB
/
selectLowestShingleDist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""docstring
"""
__revision__ = '0.1'
import sys,os
import getopt
import gzip
import unlzw
# import zlib
# import matplotlib.pyplot as plt
# import matplotlib.ticker as mtick
# Example: python selectLowestShingleDist.py --input=<results file>
def usage():
    """Print the command-line help text to standard output.

    The options listed here mirror the ones cmdProcess hands to getopt
    (``--help``/``-h`` and ``--input=``), so the help no longer advertises
    flags that the parser would reject.
    """
    # Show the name the script was actually invoked with instead of a
    # template placeholder.
    scriptName = os.path.basename(sys.argv[0])
    print("""python %s
    --help
    --input=whitespace-delimited file (topic id, doc id, ..., score in column 8)
    """ % scriptName)
def error():
    """Show the usage text, then terminate the process with exit code -1."""
    usage()
    # Raising SystemExit directly is what sys.exit(-1) does under the hood.
    raise SystemExit(-1)
def cmdProcess(argv):
    """Parse command-line arguments into a dict keyed by option name.

    argv is the argument list without the program name. Recognised options
    are ``-h``/``--help`` (print usage and exit) and ``--input=VALUE``.
    An unrecognised option prints the usage text and exits via error().

    Returns a dict holding one placeholder entry plus one entry per parsed
    long option, keyed by the option name without its leading dashes.
    """
    # Placeholder entry; nothing in the visible code reads it.
    parsedArgs = {"defaulArgument1": ""}
    try:
        optionPairs, _ = getopt.getopt(argv, "h", ["help", "input="])
    except getopt.GetoptError:
        error()
    for flag, value in optionPairs:
        if flag == "--help" or flag == "-h":
            usage()
            sys.exit()
        # Drop the leading "--" so "--input" is stored under "input".
        parsedArgs[flag[2:]] = value
    return parsedArgs
def selectBestPerTopic(lines):
    """Pick, for every topic id, the input line with the highest score.

    Each line is split on whitespace: field 1 is the topic id, field 2 the
    document id and field 8 the score (parsed as a float). Blank lines are
    skipped. When several lines of one topic share the top score, the first
    one encountered is kept.

    Returns a list of ``(topic_id, doc_id, remaining_fields)`` tuples, one
    per topic, in the order the topics first appear; ``remaining_fields``
    is every field from the third onwards, as strings.

    Raises IndexError for a non-blank line with fewer than 8 fields and
    ValueError when the score field is not a number.

    NOTE(review): the script name suggests selecting the *lowest* distance,
    but the comparison keeps the *highest* value of column 8 - confirm the
    intended direction.
    """
    best = {}    # topic id -> (score, doc id, remaining fields)
    order = []   # topic ids in first-seen order, for deterministic output
    for line in lines:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing newline)
        tid = fields[0]
        did = fields[1]
        score = float(fields[7])
        if tid not in best:
            order.append(tid)
        elif not (best[tid][0] < score):
            # Only a strictly greater score replaces the stored line.
            continue
        best[tid] = (score, did, fields[2:])
    return [(tid, best[tid][1], best[tid][2]) for tid in order]


if __name__ == "__main__":
    argvNum = 1
    if len(sys.argv) <= argvNum:
        error()
    myArgs = cmdProcess(sys.argv[1:])
    # --input is required; show the usage text instead of a bare KeyError.
    inputPath = myArgs.get('input')
    if not inputPath:
        error()
    with open(inputPath, 'r') as BMF:
        selected = selectBestPerTopic(BMF)
    for tid, did, rest in selected:
        # Single-argument print() behaves the same on Python 2 and 3.
        print('\t'.join([tid, did] + rest))