crazyyanchao
diff --git a/‎IDEA开源社.txt
+572,857 b/‎IDEA开源社.txt
+572,857
diff --git a/‎news.txt
+21 b/‎news.txt
+21
diff --git a/‎news_5_18.txt
+2,060 b/‎news_5_18.txt
+2,060
diff --git a/‎news_NewWordFound/FoundNewWords.py
+56 b/‎news_NewWordFound/FoundNewWords.py
+56
diff --git a/‎news_NewWordFound/FoundNewWords_2.py
+89 b/‎news_NewWordFound/FoundNewWords_2.py
+89
diff --git a/‎news_NewWordFound/FoundNewWords_2_BackUp.py
+79 b/‎news_NewWordFound/FoundNewWords_2_BackUp.py
+79
diff --git a/‎news_NewWordFound/FoundNewWords_pymysql.py
+13 b/‎news_NewWordFound/FoundNewWords_pymysql.py
+13
diff --git a/‎news_NewWordFound/event_news_ref.txt b/‎news_NewWordFound/event_news_ref.txt
diff --git a/‎news_NewWordFound/python_mysql.py
+90 b/‎news_NewWordFound/python_mysql.py
+90
@@ -0,0 +1,21 @@
+原标题：形势严峻！这个地方书记市长纪委书记为何连续空降？
+市委书记、市长、市委副书记接连落马的广东江门市，政治生态修复从补齐关键岗位开始。
+在连续迎来空降市委书记、市长候选人后，江门新一任纪委书记近日也到岗了。值得关注的是，他也是从省里空降的，还是纪检部门。
+这位新纪委书记叫项天保，任职省纪委26年，在案例管理、派驻机构、巡视部门等关键岗位都工作过，经验十分丰富。
+去年底，江门成立市委巡察工作机构，时任省委巡视办副主任的项天保亲赴江门参加了启动仪式。
+长安街知事APP此前曾介绍过，江门是腐败的重灾区，市委书记毛荣楷，市长邓伟根，市委副书记、政法委书记邹家军，市委常委王积俊，市人大常委会副主任聂党权（曾任市委副书记）落马，班子塌方全国罕见。
+“中央派来了一个沙瑞金（省委书记），又派来了一个田国富（纪委书记）”，这是《人民的名义》里的一个情节，以此说明推动从严治党的迫切性。
+江门的情况与此类似，市委书记林应武、市长候选人刘毅都是从省委组织部副部长任上调来江门的，补位落马前任。如今新纪委书记又从省级纪检部门调来，从一个侧面也反映出地方反腐形势的严峻性。
+就在项天保就任的会议上，前任纪委书记胡钛也以新身份亮相，他已经出任市委副书记、政法委书记。也就是说，现在江门市委常委班子中，有两名来自纪检系统的领导。
+胡钛是军转干部，2016年底刚刚调任江门市纪委书记。他有两次“救火”经历，一次是梅州，一次是江门。
+2014年，梅州市委书记朱泽君和纪委书记李纯德相继被调离，此后又相继被查，媒体对两人“内斗”多有报道。胡钛正是接替了李的梅州纪委书记职务。
+而去年赴江门履新，正是该市市委书记毛荣楷和市委副书记邹家军落马之后。
+胡钛之前的江门市纪委书记周伟万，也是一名老纪检，在纪检、政法战线工作了30年，今年初当选市政协主席。
+面对从严治党的新形势和班子塌方的旧局面，接力反腐，任重道远。
+近日召开的江门全市领导干部大会上，广东省委常委、组织部长邹铭根据省委书记胡春华同志的指示，对全市领导干部提出三点要求，其中特别指出——
+要进一步严明政治纪律和政治规矩，营造良好的政治生态。要保持干部队伍思想稳定和改革发展大局稳定，积极引导广大干部群众把违纪违法的个人问题与江门整体工作区分开来，不因人废事，不因案划线，不因此否定江门的工作，影响江门的发展稳定。
+营造良好的政治生态，更好地推动发展，无疑是江门工作当下的重中之重。
+来源：长安街知事
+责任编辑：初晓慧
+文章关键词： 纪委书记 市长 纪检
+我要反馈 保存网页
@@ -0,0 +1,56 @@
+# 基于切分的新词发现是对“新词发现的信息熵方法”的改进，降低计算量
+import pymongo # PyMongo是MongoDB的Python接口开发包。
+
+db = pymongo.MongoClient().baike.items
+def texts():
+    for a in db.find(no_cursor_timeout=True).limit(1000000):
+        yield a['content']
+
+from collections import defaultdict
+from itertools import tee
+from tqdm import tqdm
+import re
+'''Tqdm 是一个快速，可扩展的Python进度条，可以在 Python 长循环中添加一个进度提示信息，用户只需要封装任意的迭代器 tqdm(iterator)。'''
+
+class Find_Words:
+    def __init__(self, min_count=10, min_proba=1):
+        self.min_count = min_count
+        self.min_proba = min_proba
+        self.chars, self.pairs = defaultdict(int), defaultdict(int)
+        self.total = 0.
+    def text_filter(self, texts):
+        for a in tqdm(texts):
+            for t in re.split(u'[^\u4e00-\u9fa50-9a-zA-Z]+', a):
+                if t:
+                    yield t
+    def count(self, texts):
+        for text in self.text_filter(texts):
+            self.chars[text[0]] += 1
+            for i in range(len(text)-1):
+                self.chars[text[i+1]] += 1
+                self.pairs[text[i:i+2]] += 1
+                self.total += 1
+        self.chars = {i:j for i,j in self.chars.iteritems() if j >= self.min_count}
+        self.pairs = {i:j for i,j in self.pairs.iteritems() if j >= self.min_count}
+        self.strong_segments = {i: self.total*j/(self.chars[i[0]]*self.chars[i[1]]) for i,j in self.pairs.iteritems()}
+        self.strong_segments = {i:j for i,j in self.strong_segments.iteritems() if j >= self.min_proba}
+    def find_words(self, texts):
+        self.words = defaultdict(int)
+        self.total_words = 0.
+        for text in self.text_filter(texts):
+            s = text[0]
+            for i in range(len(text)-1):
+                if text[i:i+2] in self.strong_segments:
+                    s += text[i+1]
+                else:
+                    self.words[s] += 1
+                    self.total_words += 1
+                    s = text[i+1]
+        self.words = {i:j for i,j in self.words.iteritems() if j >= self.min_count}
+
+fw = Find_Words(16, 1)
+fw.count(texts())
+fw.find_words(texts())
+
+import pandas as pd
+words = pd.Series(fw.words).sort_values(ascending=False)
@@ -0,0 +1,89 @@
+# 	新词发现的信息熵方法与实现
+import numpy as np
+import pandas as pd
+import re
+from numpy import log, min
+import pymysql
+
+# 连接数据库
+conn = pymysql.connect(host='127.0.0.1',port = 3306,user = 'root',
+                       passwd = '123456',db = 'news_5_23')
+c = conn.cursor()
+
+# 查询多条数据fetchAll(文件存在之后就不需要重复创建)
+# c.execute("select content from event_news_ref into outfile '/var/lib/mysql-files/event_news_ref.txt'; ")
+# r = c.fetchone()
+
+f = open('/home/yanchao/PyCharmProject/TextSummary/event_news_ref.txt', 'r')  # 读取文章
+s = f.read()  # 读取为一个字符串
+
+# 定义要去掉的标点字或者字段
+drop_dict = [u'，', u'\n', u'。', u'、', u'：', u'(', u')', u'[', u']', u'.', u',', u' ', u'\u3000', u'”', u'“', u'？', u'?',
+             u'！', u'‘', u'’',u'(',u')',u'《',u'》', u'（',u'）',u'…',u'-',u'0',u'1',u'2',u'3',u'4',u'5',u'6',u'7',u'8',u'9',
+             u':',u'q',u'w',u'e',u'r',u't',u'y',u'u',u'i',u'o',u'u',u'p',u'a',u's',u'd',u'f',u'g',u'h',u'j',u'k',u'l',u'z',
+             u'x',u'c',u'v',u'b',u'n',u'm',u'<',u'>',u'@',u'!',u'#',u'$',u'%',u'^',u'&',u'*',u'/',u'?',u'~',u'Q',u'W',u'E',
+             u'R',u'T',u'Y',u'U',u'I',u'O',u'P',u'A',u'S',u'D',u'F',u'G','H',u'J',u'K',u'L',u'Z',u'X',u'C',u'V',u'B',u'N',u'M',
+             u'【',u'】',u'|',u'à',u'╰',u'{',u'=',u';',u',',u'﹌﹌﹌']
+for i in drop_dict:  # 去掉标点字或者字段
+    s = s.replace(i, '')
+
+# For convenience, a lookup table of regular expressions keyed by word length.
+myre = {2: '(..)', 3: '(...)', 4: '(....)', 5: '(.....)', 6: '(......)', 7: '(.......)'}
+
+min_count = 10  # minimum number of occurrences for a candidate (compared with a strict ">")
+min_support = 30  # minimum support for a candidate; 1 corresponds to a random combination
+min_s = 3  # minimum neighbour entropy; the larger, the more likely a standalone word
+max_sep = 4  # maximum number of characters in a candidate word
+t = []  # t[n - 1] ends up holding the counts of n-character strings
+
+t.append(pd.Series(list(s)).value_counts())  # per-character counts
+tsum = t[0].sum()  # total number of characters
+rt = []  # rt[m - 2] ends up holding the surviving m-character candidates
+
+for m in range(2, max_sep + 1):
+    print(u'正在生成%s字词...' % m)
+    t.append([])
+    # findall() returns non-overlapping matches, so scanning from each of the
+    # m start offsets collects the m-character string at every position.
+    for i in range(m):  # generate all possible m-character words
+        t[m - 1] = t[m - 1] + re.findall(myre[m], s[i:])
+
+    t[m - 1] = pd.Series(t[m - 1]).value_counts()  # per-word counts
+    t[m - 1] = t[m - 1][t[m - 1] > min_count]  # minimum-count filter
+    tt = t[m - 1][:]
+    # For every way of splitting a candidate into a prefix of m - 1 - k
+    # characters and a suffix of k + 1 characters, require
+    # tsum * count(word) / count(prefix) / count(suffix) > min_support.
+    for k in range(m - 1):
+        qq = np.array(list(map(lambda ms: tsum * t[m - 1][ms] / t[m - 2 - k][ms[:m - 1 - k]] / t[k][ms[m - 1 - k:]],
+                               tt.index))) > min_support  # minimum-support filter.
+        tt = tt[qq]
+    rt.append(tt.index)
+
+
+def cal_S(sl):  # 信息熵计算函数
+    return -((sl / sl.sum()).apply(log) * sl / sl.sum()).sum()
+
+
+for i in range(2, max_sep + 1):
+    print(u'正在进行%s字词的最大熵筛选(%s)...' % (i, len(rt[i - 2])))
+    pp = []  # 保存所有的左右邻结果
+    for j in range(i):
+        pp = pp + re.findall('(.)%s(.)' % myre[i], s[j:])
+    pp = pd.DataFrame(pp).set_index(1).sort_index()  # 先排序，这个很重要，可以加快检索速度
+    index = np.sort(np.intersect1d(rt[i - 2], pp.index))  # 作交集
+    # 下面两句分别是左邻和右邻信息熵筛选
+    index = index[np.array(list(map(lambda s: cal_S(pd.Series(pp[0][s]).value_counts()), index))) > min_s]
+    rt[i - 2] = index[np.array(list(map(lambda s: cal_S(pd.Series(pp[2][s]).value_counts()), index))) > min_s]
+
+# # 下面都是输出前处理
+# for i in range(len(rt)):
+#     t[i + 1] = t[i + 1][rt[i]]
+#     t[i + 1].sort(ascending=False)
+
+# 保存结果并输出
+pd.DataFrame(pd.concat(t[1:])).to_csv('result.txt', header=False)
+
+# 性能分析模块：
+# python -m cProfile -o FoundNewWords_2.out FoundNewWords_2.py
+# 随机排序：
+# python -m cProfile FoundNewWords.py
+# 按耗时排序：
+# python -c "import pstats; p=pstats.Stats('FoundNewWords_2.out'); p.sort_stats('time').print_stats()"
+
+
@@ -0,0 +1,79 @@
+# 	新词发现的信息熵方法与实现
+import numpy as np
+import pandas as pd
+import re
+from numpy import log, min
+
+f = open('机器学习.txt', 'r')  # 读取文章
+s = f.read()  # 读取为一个字符串
+
+# 定义要去掉的标点字或者字段
+drop_dict = [u'，', u'\n', u'。', u'、', u'：', u'(', u')', u'[', u']', u'.', u',', u' ', u'\u3000', u'”', u'“', u'？', u'?',
+             u'！', u'‘', u'’',u'(',u')',u'《',u'》', u'（',u'）',u'…',u'-',u'0',u'1',u'2',u'3',u'4',u'5',u'6',u'7',u'8',u'9',
+             u':',u'q',u'w',u'e',u'r',u't',u'y',u'u',u'i',u'o',u'u',u'p',u'a',u's',u'd',u'f',u'g',u'h',u'j',u'k',u'l',u'z',
+             u'x',u'c',u'v',u'b',u'n',u'm',u'<',u'>',u'@',u'!',u'#',u'$',u'%',u'^',u'&',u'*',u'/',u'?',u'~',u'Q',u'W',u'E',
+             u'R',u'T',u'Y',u'U',u'I',u'O',u'P',u'A',u'S',u'D',u'F',u'G','H',u'J',u'K',u'L',u'Z',u'X',u'C',u'V',u'B',u'N',u'M',
+             u'【',u'】',u'|',u'à',u'╰',u'{',u'=',u';',u',',u'﹌﹌﹌']
+for i in drop_dict:  # 去掉标点字或者字段
+    s = s.replace(i, '')
+
+# For convenience, a lookup table of regular expressions keyed by word length.
+myre = {2: '(..)', 3: '(...)', 4: '(....)', 5: '(.....)', 6: '(......)', 7: '(.......)'}
+
+min_count = 10  # minimum number of occurrences for a candidate (compared with a strict ">")
+min_support = 30  # minimum support for a candidate; 1 corresponds to a random combination
+min_s = 3  # minimum neighbour entropy; the larger, the more likely a standalone word
+max_sep = 4  # maximum number of characters in a candidate word
+t = []  # t[n - 1] ends up holding the counts of n-character strings
+
+t.append(pd.Series(list(s)).value_counts())  # per-character counts
+tsum = t[0].sum()  # total number of characters
+rt = []  # rt[m - 2] ends up holding the surviving m-character candidates
+
+for m in range(2, max_sep + 1):
+    print(u'正在生成%s字词...' % m)
+    t.append([])
+    # findall() returns non-overlapping matches, so scanning from each of the
+    # m start offsets collects the m-character string at every position.
+    for i in range(m):  # generate all possible m-character words
+        t[m - 1] = t[m - 1] + re.findall(myre[m], s[i:])
+
+    t[m - 1] = pd.Series(t[m - 1]).value_counts()  # per-word counts
+    t[m - 1] = t[m - 1][t[m - 1] > min_count]  # minimum-count filter
+    tt = t[m - 1][:]
+    # For every way of splitting a candidate into a prefix of m - 1 - k
+    # characters and a suffix of k + 1 characters, require
+    # tsum * count(word) / count(prefix) / count(suffix) > min_support.
+    for k in range(m - 1):
+        qq = np.array(list(map(lambda ms: tsum * t[m - 1][ms] / t[m - 2 - k][ms[:m - 1 - k]] / t[k][ms[m - 1 - k:]],
+                               tt.index))) > min_support  # minimum-support filter.
+        tt = tt[qq]
+    rt.append(tt.index)
+
+
+def cal_S(sl):  # 信息熵计算函数
+    return -((sl / sl.sum()).apply(log) * sl / sl.sum()).sum()
+
+
+for i in range(2, max_sep + 1):
+    print(u'正在进行%s字词的最大熵筛选(%s)...' % (i, len(rt[i - 2])))
+    pp = []  # 保存所有的左右邻结果
+    for j in range(i):
+        pp = pp + re.findall('(.)%s(.)' % myre[i], s[j:])
+    pp = pd.DataFrame(pp).set_index(1).sort_index()  # 先排序，这个很重要，可以加快检索速度
+    index = np.sort(np.intersect1d(rt[i - 2], pp.index))  # 作交集
+    # 下面两句分别是左邻和右邻信息熵筛选
+    index = index[np.array(list(map(lambda s: cal_S(pd.Series(pp[0][s]).value_counts()), index))) > min_s]
+    rt[i - 2] = index[np.array(list(map(lambda s: cal_S(pd.Series(pp[2][s]).value_counts()), index))) > min_s]
+
+# # 下面都是输出前处理
+# for i in range(len(rt)):
+#     t[i + 1] = t[i + 1][rt[i]]
+#     t[i + 1].sort(ascending=False)
+
+# 保存结果并输出
+pd.DataFrame(pd.concat(t[1:])).to_csv('result.txt', header=False)
+
+# 性能分析模块：
+# python -m cProfile -o FoundNewWords_2.out FoundNewWords_2.py
+# 随机排序：
+# python -m cProfile FoundNewWords.py
+# 按耗时排序：
+# python -c "import pstats; p=pstats.Stats('FoundNewWords_2.out'); p.sort_stats('time').print_stats()"
+
+
@@ -0,0 +1,13 @@
+import pymysql
+
+# 连接数据库
+conn = pymysql.connect(host='127.0.0.1',port = 3306,user = 'root',
+                       passwd = '123456',db = 'news_5_23')
+c = conn.cursor()
+
+# 查询多条数据fetchAll
+c.execute("select content from event_news_ref into outfile '/var/lib/mysql-files/event_news_ref.txt'; ")
+r = c.fetchone()
+print(r)
+# f = open('event_news_ref.txt', 'w')
+print("write to .txt file!")
@@ -0,0 +1,90 @@
+# #-------------------python2.7连接MySQL数据库------------------------
+#
+# #!/usr/bin/env python
+#
+# import time
+# import MySQLdb   # 不支持Python3
+#
+# #连接
+# conn = MySQLdb.connect(host="localhost",user="root",passwd="root",db="test",charset="utf8")
+# cursor = conn.cursor()
+#
+# #删除表
+# sql = "drop table if exists user"
+# cursor.execute(sql)
+#
+# #创建
+# sql = "create table if not exists user(name varchar(128) primary key,created int(10))"
+# cursor.execute(sql)
+#
+# #写入
+# sql = "insert into user(name,created) values(%s,%s)"
+# param = ("aaa",int(time.time()))
+# n = cursor.execute(sql,param)
+# print ('insert',n)
+#
+# #写入多行
+# sql = "insert into user(name,created) values(%s,%s)"
+# param = (("bbb",int(time.time())),("ccc",33),("ddd",44))
+# n = cursor.executemany(sql,param)
+# print ('insertmany',n)
+#
+# #更新
+# sql = "update user set name=%s where name='aaa'"
+# param = ("zzz")
+# n = cursor.execute(sql,param)
+# print ('update',n)
+#
+# #查询
+# n = cursor.execute("select * from user")
+# for row in cursor.fetchall():
+# 	print (row)
+# 	for r in row:
+# 		print (r)
+#
+# #删除 注意：MYSQL的占位符是%s
+# sql = "delete from user where name=%s"
+# param = ("bbb")
+# n = cursor.execute(sql,param)
+# print ('delete',n)
+#
+# #查询
+# n = cursor.execute("select * from user")
+# print (cursor.fetchall())
+#
+# cursor.close()
+#
+# #提交
+# conn.commit()
+# #关闭
+# conn.close()
+#
+# # -------------------------------------------------------------
+# # 以mysql或者sqlite为例，请用代码给出简洁且完整的数据库操作示例。注：请参考视频中的代码。
+# # -------------------------------------------------------------
+# # [参考代码：]
+#
+# # 导入MySQL驱动:
+# import mysql
+# from mysql import connector
+#
+# # import mysql.connector
+# # 注意把password设为你的root口令:
+# conn = mysql.connector.connect(user='root', password='password', database='test', use_unicode=True)
+# cursor = conn.cursor()
+# # 创建user表:
+# cursor.execute('create table user (id varchar(20) primary key, name varchar(20))')
+# # 插入一行记录，注意MySQL的占位符是%s:
+# cursor.execute('insert into user (id, name) values (%s, %s)', ['1', 'Michael'])
+#
+# # 提交事务:
+# conn.commit()
+# cursor.close()
+# # 运行查询:
+# cursor = conn.cursor()
+# cursor.execute('select * from user where id = %s', ('1',))
+# values = cursor.fetchall()
+# print (values)
+#
+# # 关闭Cursor和Connection:
+# cursor.close()
+# conn.close()