A Chinese Modeling Exercise with the Wikipedia Corpus
Wiki corpus download
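The post does not show the download step itself; assuming the standard Wikimedia dump mirror, the latest Chinese dump can be fetched like this (the exact file name depends on the dump date):

wget https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2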
XML text extraction
- Extract the article text with Wikipedia Extractor
git clone https://github.com/attardi/wikiextractor.git wikiextractor
cd wikiextractor
python setup.py install
./WikiExtractor.py -b 500M -o extracted zhwiki-latest-pages-articles.xml.bz2
### -o specifies the output directory; check the extracted files there after the run finishes
### -b sets the size of each output file (default: 1M)
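For orientation, Wikipedia Extractor wraps each article in <doc> tags; the segmentation script further below relies on this when it keys on lines starting with <doc and </doc. A typical fragment looks roughly like this (the id, url, and body text here are illustrative):

<doc id="13" url="https://zh.wikipedia.org/wiki?curid=13" title="数学">
数学
数学是研究数量、结构、变化以及空间等概念的一门学科。
</doc>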
Traditional-to-Simplified conversion
brew install opencc
opencc -i wiki_00 -o zh_wiki_00 -c zht2zhs.ini
### if zht2zhs.ini raises an error, use the configuration below instead
opencc -i wiki_00 -o wiki_00_zh.txt -c t2s.json
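If you would rather do the conversion in Python, the opencc-python-reimplemented package (an assumption on my part; the post itself only uses the command-line tool) exposes the same configurations:

# pip install opencc-python-reimplemented   (assumed package, not used in the original post)
from opencc import OpenCC

cc = OpenCC('t2s')          # Traditional -> Simplified, same as the t2s.json config above
print(cc.convert(u'漢語'))  # -> 汉语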
Special character handling
This step covers word segmentation, stripping punctuation, and removing article-structure markup. (It is advisable not to strip punctuation from word2vec training data; in sentiment-analysis applications, for example, punctuation carries useful signal.) The result is a segmented plain-text file with one article per line and words separated by spaces. script_seg.py is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs
import sys
import jieba.posseg as pseg

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: python script_seg.py infile outfile")
        sys.exit()
    i = 0
    infile, outfile = sys.argv[1:3]
    output = codecs.open(outfile, 'w', 'utf-8')
    with codecs.open(infile, 'r', 'utf-8') as myfile:
        for line in myfile:
            line = line.strip()
            if len(line) < 1:
                continue
            if line.startswith('<doc'):   # article start marker from Wikipedia Extractor
                i = i + 1
                if i % 1000 == 0:
                    print('Finished ' + str(i) + ' articles')
                continue
            if line.startswith('</doc'):  # article end: terminate this output line
                output.write('\n')
                continue
            words = pseg.cut(line)
            for word, flag in words:
                if flag.startswith('x'):  # 'x' flags mark punctuation and non-word tokens
                    continue
                output.write(word + ' ')
    output.close()
    print('Finished ' + str(i) + ' articles')
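As a quick sanity check on the filter above, jieba's part-of-speech tagger marks punctuation and other non-word tokens with flags starting with 'x', which is exactly what the script skips; a minimal sketch:

# minimal sketch: inspect the flags that pseg.cut produces
import jieba.posseg as pseg

for word, flag in pseg.cut(u'数学,是研究数量的学科。'):
    print(word, flag)   # ',' and '。' come back with flag 'x' and would be dropped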
Run the command:
time python script_seg.py wiki_00_output.txt seg_wiki_00_output.txt
Remove blank lines
sed '/^$/d' seg_wiki_00_output.txt > trim_seg_wiki_00_output.txt
Segmentation preprocessing
Testjieba.py
# -*- coding: utf-8 -*-
import codecs
import jieba

def cut_words(sentence):
    # segment one line of text into space-separated words
    return " ".join(jieba.cut(sentence))

f = codecs.open('wiki.zh_output.text', 'r', encoding="utf8")             ## input
target = codecs.open("wiki.zh_output_jieba.text", 'w', encoding="utf8")  ## output
print('open files')
line_num = 1
line = f.readline()
while line:
    print('---- processing ', line_num, ' article----------------')
    line_seg = cut_words(line)
    target.writelines(line_seg)
    line_num = line_num + 1
    line = f.readline()
f.close()
target.close()

# python Testjieba.py
Modeling
word2vec_model.py
# -*- coding: utf-8 -*-
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 4:
        print("Usage: python word2vec_model.py infile outmodel outvector")
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]
    # NOTE: gensim < 4.0 uses `size`; gensim >= 4.0 renamed it to `vector_size`
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
# python word2vec_model.py zh.jian.wiki.seg.txt wiki.zh.text.model wiki.zh.text.vector
# opencc -i wiki_texts.txt -o test.txt -c t2s.json
python word2vec_model.py wiki.zh_output_jieba.text wiki.zh_output_jieba_model.model wiki.zh_output_jieba_model.vector
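The plain-text .vector file written by save_word2vec_format can also be loaded on its own through KeyedVectors, which avoids loading the full trainable model (a sketch using the output names above):

from gensim.models import KeyedVectors

# load just the word vectors, not the full model state
wv = KeyedVectors.load_word2vec_format('wiki.zh_output_jieba_model.vector', binary=False)
print(wv.most_similar(u'数学'))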
Test the vector model
test_model.py
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec

en_wiki_word2vec_model = Word2Vec.load('wiki.zh_output_jieba_model.model')
# the 10 nearest neighbours of the query word by cosine similarity
res = en_wiki_word2vec_model.most_similar(u"数学")
print(res)
for i in res:
    print(i[0])
Example results
[(u'\u6d41\u884c\u6027', 0.8884406685829163), (u'\u8179\u6cfb', 0.8853926658630371), (u'\u8111\u819c\u708e', 0.8741780519485474), (u'\u6025\u6027', 0.8728883266448975), (u'\u6162\u6027', 0.8612829446792603), (u'\u8fc7\u654f\u6027', 0.8557898998260498), (u'\u5173\u8282\u708e', 0.8552945256233215), (u'\u75c7', 0.8550660610198975), (u'\u51fa\u8840', 0.8524866700172424), (u'\u766b\u75eb', 0.8476200103759766)]
流行性
腹泻
脑膜炎
急性
慢性
过敏性
关节炎
症
出血
癫痫
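Beyond most_similar, two other quick spot checks on the trained model (a sketch; whether a given word is in the vocabulary depends on the corpus and the min_count setting):

# sketch of further spot checks; assumes the query words survived min_count filtering
from gensim.models import Word2Vec

model = Word2Vec.load('wiki.zh_output_jieba_model.model')
print(model.similarity(u'数学', u'物理'))                  # cosine similarity between two words
print(model.doesnt_match(u'数学 物理 化学 腹泻'.split()))  # pick the word that does not belong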