
Commit 5366b4c

utf8 issues resolved, whew
1 parent 1a19eca commit 5366b4c

8 files changed, 124 additions and 23 deletions

Current_members_of_the_United_States_House_of_Representatives.py

Lines changed: 24 additions & 8 deletions
@@ -1,13 +1,18 @@
-import lxml.html
+
 import urllib2
 import urllib
 import os
 import re
 import cache
+import encode
+import lxml.html
+import lxml
+
 """
 the results is a dictionary :
 names
 links
+wp
 
 """
@@ -33,7 +38,10 @@ def parse_rep() :
         'links': {},
         }
     d = cache.cachewp ('http://en.wikipedia.org/wiki/Current_members_of_the_United_States_House_of_Representatives?printable=yes')
-    html = lxml.html.document_fromstring( d )
+
+    myparser = lxml.etree.HTMLParser(encoding="utf-8")
+    html = lxml.etree.HTML(d, parser=myparser)
+
     tables = html.xpath("//table")
     table = tables[1]
     for r in table.xpath("//tr") :
@@ -42,22 +50,30 @@ def parse_rep() :
         f_district = data[1]
         f_image = data[2]
         f_name = data[3]
-        (skip, skip , f_district_link, skip) =f_district.iterlinks().next()
-        (f_name_element, skip , f_name_link, skip) =f_name.iterlinks().next()
+        f_name_link = ""
+        f_name_element = ""
+        f_district_link=""
+        for l in f_name.xpath("span/span/a"):
+            f_name_link = l.get("href")
+            f_name_element = l.text
+
+        for l in f_district.xpath("span/span/a"):
+            f_district_link = l.get("href")
+
         obj = {
             'link' : f_name_link,
             'district' : f_district_link,
-            'name' : f_name_element.text
+            'name' : f_name_element
             }
-        reps['names'][f_name_element.text]= obj
+        reps['names'][f_name_element]= obj
 
         link = re.search("/([^\/]+)$",f_name_link).group(1)
         link = urllib.unquote(link)
-        # link=link.encode('ascii', 'ignore')
+        link = encode.decode(link)
         reps['wp'][link]= obj
 
         """ we are going to collect all the links and point to the object """
-        parse_wiki_page(f_name_link,reps,obj)
+        # parse_wiki_page(f_name_link,reps,obj)
 
     return reps

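A note on the parser swap above: lxml.html.document_fromstring() is left to guess the encoding of a raw byte string, while an lxml.etree.HTMLParser constructed with encoding="utf-8" decodes the input explicitly. A minimal standalone sketch of the technique (not part of the commit; the sample bytes are invented):

    # Python 2 sketch: pin lxml's HTML parser to UTF-8 instead of letting it guess
    import lxml.etree

    raw = '<p>Rub\xc3\xa9n</p>'                       # "Ruben" with an accented e, as UTF-8 bytes
    parser = lxml.etree.HTMLParser(encoding="utf-8")  # decode input as UTF-8
    html = lxml.etree.HTML(raw, parser=parser)
    print repr(html.xpath("//p")[0].text)             # u'Rub\xe9n'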
List_of_current_United_States_Senators.py

Lines changed: 5 additions & 2 deletions
@@ -4,6 +4,7 @@
 import os
 import re
 import cache
+import encode
 # List_of_current_United_States_Senators
 """
 the results is a dictionary :
@@ -12,6 +13,7 @@
 wp
 """
 
+
 def parse_wiki_page_links(d,reps,obj):
     for (f_name_element, attr , f_link, pos) in d.iterlinks():
         if(attr == 'href'):
@@ -56,11 +58,12 @@ def parse() :
 
         link = re.search("/([^\/]+)$",f_name_link).group(1)
         link = urllib.unquote(link)
-        # link=link.encode('ascii', 'ignore')
+        link = encode.decode(link)
+
         reps['wp'][link]= obj
 
         """ we are going to collect all the links and point to the object """
-        parse_wiki_page(f_name_link,reps,obj)
+        # parse_wiki_page(f_name_link,reps,obj)
 
     return reps

cache.py

Lines changed: 11 additions & 9 deletions
@@ -3,7 +3,7 @@
 import re
 import urllib2
 import urllib
-
+import codecs
 def cache (x,f) :
 
     filename = "data/%s.pkl" % x
@@ -23,18 +23,17 @@ def cache (x,f) :
 def cachewp (url) :
     data = cacheweb(url)
     if (re.search("Redirected from",data)):
-        #raise Exception( " redirect %s" % url)
-        print "redirect %s" % url
+        raise Exception( " redirect %s" % url)
+        #print "redirect %s" % url
     return data
 
 
 def cacheweb (url) :
     # print url
     hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
-           'Accept-Encoding': 'none',
-           'Accept-Language': 'en-US,en;q=0.8',
+           'Accept-Charset': 'utf-8',
+           'Accept-Language': 'en-US',
            'Connection': 'keep-alive'}
     url2=url
     url2=url2.replace("/","_")
@@ -44,13 +43,16 @@ def cacheweb (url) :
         os.makedirs("data")
 
     if (os.path.exists(filename)):
-        f =open(filename,'r')
-        return f.read()
+        f = codecs.open(filename, "rb", "utf-8")
+        data= f.read()
+        return data
     else:
         r = urllib2.Request(url=url, headers=hdr )
         d = urllib2.urlopen(r)
         data= d.read()
-        f =open(filename,'w')
+        data = data.decode("utf-8")
+        f = codecs.open(filename,'wb','utf-8')
+
        f.write(data)
     return data

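The cache.py changes follow the usual Python 2 round-trip: decode the bytes once at the network boundary, keep unicode in memory, and let codecs.open() encode and decode transparently at the file boundary. A minimal sketch of that round-trip (the file name is invented):

    # Python 2 sketch: codecs.open() keeps the on-disk cache in UTF-8
    import codecs

    text = u'Rub\xe9n Hinojosa'                 # unicode in memory
    f = codecs.open('demo.txt', 'wb', 'utf-8')  # encodes on write
    f.write(text)
    f.close()

    f = codecs.open('demo.txt', 'rb', 'utf-8')  # decodes on read
    print f.read() == text                      # True
    f.close()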
encode.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+import unicodedata
+
+def decode(link) :
+    b = link
+    link = unicode(link, 'utf-8')
+    link = unicodedata.normalize('NFKD', link)
+    return strip(link)
+
+def decodeuc(link) :
+    b = link
+    link = unicode(link)
+    link = unicodedata.normalize('NFKD', link)
+    return strip(link)
+
+
+def strip(link) :
+    b = link
+
+    link = link.encode('ascii','ignore')
+    if (link != b):
+        print "Before %s After %s:" % (b, link)
+    return link

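The new encode module exposes two entry points: decode() takes a UTF-8 byte string, decodeuc() takes an already-decoded unicode object, and both funnel into strip(), which NFKD-decomposes accented characters and drops the combining marks via encode('ascii', 'ignore'). A usage sketch under Python 2 (the sample name mirrors test.py below):

    # Python 2 sketch: both entry points reduce accented names to plain ASCII
    import encode

    print encode.decode('Rub\xc3\xa9n_Hinojosa')  # UTF-8 bytes -> 'Ruben_Hinojosa'
    print encode.decodeuc(u'Rub\xe9n_Hinojosa')   # unicode     -> 'Ruben_Hinojosa'

strip() also prints a Before/After line whenever normalization changed the string, which makes stray non-ASCII keys easy to spot in the test scripts.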
legislators_current.py

Lines changed: 7 additions & 4 deletions
@@ -1,9 +1,13 @@
 import yaml
 import cache
-
+import encode
+import codecs
 
 def loadlegis ():
-    legis = yaml.load(file('congress-legislators/legislators-current.yaml', 'rb').read())
+    filename ='congress-legislators/legislators-current.yaml'
+    f = codecs.open(filename, "rb", "utf-8")
+    data = f.read()
+    legis = yaml.load(data)
     return legis
 
 def load():
@@ -15,8 +19,7 @@ def load():
         if 'wikipedia' in l['id'] :
             wp = l['id']['wikipedia']
             wp = wp.replace(" ","_")
-            # wp = wp.decode("utf8")
-            # wp=wp.encode('ascii', 'ignore')
+            wp = encode.decodeuc(wp)
             data['wp'][wp]=l
     return data

test.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import encode
+wiki = u"Rubén Hinojosa"
+legis = u"Rubén Hinojosa"
+
+print encode.decode(wiki)
+print encode.decode(legis)

test_legis.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import os
+import legislators_current as leg
+import cache
+import urllib
+import encode
+import re
+from cStringIO import StringIO
+
+legs= leg.load()
+#print legs
+#for i in legs.keys() :
+#    print i
+#    n= i[1]['name']
+#    n2 = encode.decode(n)
+#    print n,n2,i
+
+
+print "REPS:",sorted(legs['wp'].keys())
+

test_reps.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+import os
+#import socrata_rows as soc
+import Current_members_of_the_United_States_House_of_Representatives as reps
+import cache
+import urllib
+import encode
+import re
+from cStringIO import StringIO
+
+rep= cache.cache('reps',reps.parse_rep)
+
+def foo(link):
+    link = urllib.unquote(link)
+    link = re.search("/([^\/]+)$",link).group(1)
+    link = encode.decode(link)
+    # print link
+    return link
+
+#print rep
+for i in rep['names'].items() :
+    # print i
+    l= i[1]['link']
+    l2 = foo(l)
+    # print l2
+
+
+#print "REPS:",sorted(rep['wp'].keys())
+
