
Commit 5366b4c

utf8 issues resolved, whew
1 parent 1a19eca commit 5366b4c

8 files changed, 124 additions and 23 deletions

Current_members_of_the_United_States_House_of_Representatives.py

Lines changed: 24 additions & 8 deletions
@@ -1,13 +1,18 @@
-import lxml.html
+
 import urllib2
 import urllib
 import os
 import re
 import cache
+import encode
+import lxml.html
+import lxml
+
 """
 the results is a dictionary :
 names
 links
+wp
 
 """
@@ -33,7 +38,10 @@ def parse_rep() :
         'links': {},
         }
     d = cache.cachewp ('http://en.wikipedia.org/wiki/Current_members_of_the_United_States_House_of_Representatives?printable=yes')
-    html = lxml.html.document_fromstring( d )
+
+    myparser = lxml.etree.HTMLParser(encoding="utf-8")
+    html = lxml.etree.HTML(d, parser=myparser)
+
     tables = html.xpath("//table")
     table = tables[1]
     for r in table.xpath("//tr") :
@@ -42,22 +50,30 @@ def parse_rep() :
         f_district = data[1]
         f_image = data[2]
         f_name = data[3]
-        (skip, skip , f_district_link, skip) =f_district.iterlinks().next()
-        (f_name_element, skip , f_name_link, skip) =f_name.iterlinks().next()
+        f_name_link = ""
+        f_name_element = ""
+        f_district_link=""
+        for l in f_name.xpath("span/span/a"):
+            f_name_link = l.get("href")
+            f_name_element = l.text
+
+        for l in f_district.xpath("span/span/a"):
+            f_district_link = l.get("href")
+
         obj = {
             'link' : f_name_link,
             'district' : f_district_link,
-            'name' : f_name_element.text
+            'name' : f_name_element
             }
-        reps['names'][f_name_element.text]= obj
+        reps['names'][f_name_element]= obj
 
         link = re.search("/([^\/]+)$",f_name_link).group(1)
         link = urllib.unquote(link)
-        # link=link.encode('ascii', 'ignore')
+        link = encode.decode(link)
         reps['wp'][link]= obj
 
         """ we are going to collect all the links and point to the object """
-        parse_wiki_page(f_name_link,reps,obj)
+        # parse_wiki_page(f_name_link,reps,obj)
 
     return reps

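A note on the parser swap above: lxml.html.document_fromstring() is left to guess the encoding of a raw byte string, while an lxml.etree.HTMLParser constructed with encoding="utf-8" decodes the input explicitly. A minimal standalone sketch of the technique (not part of the commit; the sample bytes are invented):

    # Python 2 sketch: pin lxml's HTML parser to UTF-8 instead of letting it guess
    import lxml.etree

    raw = '<p>Rub\xc3\xa9n</p>'                       # "Ruben" with an accented e, as UTF-8 bytes
    parser = lxml.etree.HTMLParser(encoding="utf-8")  # decode input as UTF-8
    html = lxml.etree.HTML(raw, parser=parser)
    print repr(html.xpath("//p")[0].text)             # u'Rub\xe9n'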
List_of_current_United_States_Senators.py

Lines changed: 5 additions & 2 deletions
@@ -4,6 +4,7 @@
 import os
 import re
 import cache
+import encode
 # List_of_current_United_States_Senators
 """
 the results is a dictionary :
@@ -12,6 +13,7 @@
 wp
 """
 
+
 def parse_wiki_page_links(d,reps,obj):
     for (f_name_element, attr , f_link, pos) in d.iterlinks():
         if(attr == 'href'):
@@ -56,11 +58,12 @@ def parse() :
 
         link = re.search("/([^\/]+)$",f_name_link).group(1)
         link = urllib.unquote(link)
-        # link=link.encode('ascii', 'ignore')
+        link = encode.decode(link)
+
         reps['wp'][link]= obj
 
         """ we are going to collect all the links and point to the object """
-        parse_wiki_page(f_name_link,reps,obj)
+        # parse_wiki_page(f_name_link,reps,obj)
 
     return reps

cache.py

Lines changed: 11 additions & 9 deletions
@@ -3,7 +3,7 @@
 import re
 import urllib2
 import urllib
-
+import codecs
 def cache (x,f) :
 
     filename = "data/%s.pkl" % x
@@ -23,18 +23,17 @@ def cache (x,f) :
 def cachewp (url) :
     data = cacheweb(url)
     if (re.search("Redirected from",data)):
-        #raise Exception( " redirect %s" % url)
-        print "redirect %s" % url
+        raise Exception( " redirect %s" % url)
+        #print "redirect %s" % url
     return data
 
 
 def cacheweb (url) :
     # print url
     hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
-           'Accept-Encoding': 'none',
-           'Accept-Language': 'en-US,en;q=0.8',
+           'Accept-Charset': 'utf-8',
+           'Accept-Language': 'en-US',
            'Connection': 'keep-alive'}
     url2=url
     url2=url2.replace("/","_")
@@ -44,13 +43,16 @@ def cacheweb (url) :
         os.makedirs("data")
 
     if (os.path.exists(filename)):
-        f =open(filename,'r')
-        return f.read()
+        f = codecs.open(filename, "rb", "utf-8")
+        data= f.read()
+        return data
     else:
         r = urllib2.Request(url=url, headers=hdr )
         d = urllib2.urlopen(r)
         data= d.read()
-        f =open(filename,'w')
+        data = data.decode("utf-8")
+        f = codecs.open(filename,'wb','utf-8')
+
        f.write(data)
     return data

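The cache.py changes follow the usual Python 2 round-trip: decode the bytes once at the network boundary, keep unicode in memory, and let codecs.open() encode and decode transparently at the file boundary. A minimal sketch of that round-trip (the file name is invented):

    # Python 2 sketch: codecs.open() keeps the on-disk cache in UTF-8
    import codecs

    text = u'Rub\xe9n Hinojosa'                 # unicode in memory
    f = codecs.open('demo.txt', 'wb', 'utf-8')  # encodes on write
    f.write(text)
    f.close()

    f = codecs.open('demo.txt', 'rb', 'utf-8')  # decodes on read
    print f.read() == text                      # True
    f.close()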
encode.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+import unicodedata
+
+def decode(link) :
+    b = link
+    link = unicode(link, 'utf-8')
+    link = unicodedata.normalize('NFKD', link)
+    return strip(link)
+
+def decodeuc(link) :
+    b = link
+    link = unicode(link)
+    link = unicodedata.normalize('NFKD', link)
+    return strip(link)
+
+
+def strip(link) :
+    b = link
+
+    link = link.encode('ascii','ignore')
+    if (link != b):
+        print "Before %s After %s:" % (b, link)
+    return link

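The new encode module exposes two entry points: decode() takes a UTF-8 byte string, decodeuc() takes an already-decoded unicode object, and both funnel into strip(), which NFKD-decomposes accented characters and drops the combining marks via encode('ascii', 'ignore'). A usage sketch under Python 2 (the sample name mirrors test.py below):

    # Python 2 sketch: both entry points reduce accented names to plain ASCII
    import encode

    print encode.decode('Rub\xc3\xa9n_Hinojosa')  # UTF-8 bytes -> 'Ruben_Hinojosa'
    print encode.decodeuc(u'Rub\xe9n_Hinojosa')   # unicode     -> 'Ruben_Hinojosa'

strip() also prints a Before/After line whenever normalization changed the string, which makes stray non-ASCII keys easy to spot in the test scripts.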
legislators_current.py

Lines changed: 7 additions & 4 deletions
@@ -1,9 +1,13 @@
 import yaml
 import cache
-
+import encode
+import codecs
 
 def loadlegis ():
-    legis = yaml.load(file('congress-legislators/legislators-current.yaml', 'rb').read())
+    filename ='congress-legislators/legislators-current.yaml'
+    f = codecs.open(filename, "rb", "utf-8")
+    data = f.read()
+    legis = yaml.load(data)
     return legis
 
 def load():
@@ -15,8 +19,7 @@ def load():
         if 'wikipedia' in l['id'] :
             wp = l['id']['wikipedia']
             wp = wp.replace(" ","_")
-            # wp = wp.decode("utf8")
-            # wp=wp.encode('ascii', 'ignore')
+            wp = encode.decodeuc(wp)
             data['wp'][wp]=l
     return data

test.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import encode
+wiki = u"Rubén Hinojosa"
+legis = u"Rubén Hinojosa"
+
+print encode.decode(wiki)
+print encode.decode(legis)

test_legis.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import os
+import legislators_current as leg
+import cache
+import urllib
+import encode
+import re
+from cStringIO import StringIO
+
+legs= leg.load()
+#print legs
+#for i in legs.keys() :
+#    print i
+#    n= i[1]['name']
+#    n2 = encode.decode(n)
+#    print n,n2,i
+
+
+print "REPS:",sorted(legs['wp'].keys())
+

test_reps.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+import os
+#import socrata_rows as soc
+import Current_members_of_the_United_States_House_of_Representatives as reps
+import cache
+import urllib
+import encode
+import re
+from cStringIO import StringIO
+
+rep= cache.cache('reps',reps.parse_rep)
+
+def foo(link):
+    link = urllib.unquote(link)
+    link = re.search("/([^\/]+)$",link).group(1)
+    link = encode.decode(link)
+    # print link
+    return link
+
+#print rep
+for i in rep['names'].items() :
+    # print i
+    l= i[1]['link']
+    l2 = foo(l)
+    # print l2
+
+
+#print "REPS:",sorted(rep['wp'].keys())
+
