-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathBallotpedia.py
56 lines (47 loc) · 1.35 KB
/
Ballotpedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import urllib2
import urllib
import os
import re
import cache
import encode
import lxml.html
import lxml
import wiki
"""
The result of parse() is a dictionary with these keys:
names
links
wp
"""
def parse_ballotwiki_page(x, reps, obj):
    """Load the printable view of one Ballotpedia page and parse its links.

    x    -- site-relative path of the page; it is appended to
            'http://ballotpedia.org' and followed by '?printable=yes'.
    reps -- passed through unchanged to wiki.parse_wiki_page_links.
    obj  -- passed through unchanged to wiki.parse_wiki_page_links.

    Returns whatever wiki.parse_wiki_page_links returns for the parsed page.
    """
    page_url = 'http://ballotpedia.org%s?printable=yes' % x
    # NOTE(review): cache.cachewp presumably returns the page source for the
    # URL (fetching or reading a cached copy) -- confirm against cache module.
    page_source = cache.cachewp(page_url)
    return wiki.parse_wiki_page_links(
        lxml.html.document_fromstring(page_source), reps, obj)
def parse(url):
    """Build a dictionary of entries from the ordered lists of a page.

    The document obtained from cache.cachewp(url) is parsed as UTF-8 HTML.
    Every <a> that is a direct child of an <ol>/<li> item becomes one entry.
    Anchors that have no href, or whose href has no trailing path segment
    (nothing after the last '/'), are skipped because no key can be derived
    for them.

    Returns a dict with three keys:
      'wp'    -- maps the unquoted, decoded last path segment of each href
                 to the result of parse_ballotwiki_page for that href.
      'names' -- maps each anchor's text to its per-entry object
                 {'links': {'homepage': {}}, 'link': href, 'name': text}.
      'links' -- created empty here and not written by this function; the
                 whole dictionary is handed to parse_ballotwiki_page.
    """
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    page_source = cache.cachewp(url)
    parser = lxml.etree.HTMLParser(encoding="utf-8")
    tree = lxml.etree.HTML(page_source, parser=parser)

    # Compiled once, outside the loops: captures the text after the last '/'.
    last_segment = re.compile(r"/([^/]+)$")

    for item in tree.xpath("//ol/li"):
        for anchor in item.xpath("a"):
            href = anchor.get("href")
            if not href:
                # <a> without an href (e.g. a named anchor): nothing to follow.
                continue
            match = last_segment.search(href)
            if match is None:
                # href ends in '/' or contains no '/': no page name to key on.
                continue

            name = anchor.text
            obj = {
                'links': {
                    'homepage': {}
                },
                'link': href,
                'name': name
            }

            # NOTE(review): encode.decode is a project helper; presumably it
            # normalises the unquoted byte string to text -- confirm.
            key = encode.decode(urllib.unquote(match.group(1)))

            # Collect all the links and point them at the entry object.
            reps['wp'][key] = parse_ballotwiki_page(href, reps, obj)
            reps['names'][name] = obj
    return reps