| 致远 的个人资料叉包的地下室照片日志列表 | 帮助 |
|
12月10日 用Python写了个分词程序
最近在看Python, 找东西练笔。
于是写了个分词程序,基于词长和概率的正向切词。效果还不错,最主要是只用了不到70行代码(加些技巧应该能再少一二十行)。
再看看 Peter Norvig 只用21行写了个单词纠错程序 ,也不再说啥了,继续学习去。
看得懂的扔砖吧。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
A word segmenter class based on word frequency and length.
Uses the dictionary from SogouLab.
"""
import codecs, re, collections, sets
class ChineseWordSegmenter(object):
    """Forward Chinese word segmenter based on word length and frequency.

    A dictionary file supplies ``word frequency`` pairs.  Text is cut into
    runs of CJK ideographs, and each run is segmented greedily from left to
    right: the longest dictionary word starting at the current position is
    compared with the longest word starting one character later, and the
    earlier one wins when it is more frequent or longer.

    Attributes:
        word_freq: maps a dictionary word to its frequency.  It is a
            ``defaultdict`` that yields 1 for unknown keys.
        word_incomplete: set of proper prefixes (length >= 2) of dictionary
            words, used to keep extending a match while a longer word is
            still possible.
    """

    # Frequency assumed for any string that is not in the dictionary.
    _UNKNOWN_FREQ = 1
    # Bounds of the Unicode "CJK Unified Ideographs" block.
    _CJK_FIRST = 0x4e00
    _CJK_LAST = 0x9fff

    def __init__(self, verbose=0, dic="SogouLabDic.dic",
                 dic_encoding="gb18030"):
        """Create a segmenter and load its dictionary.

        Args:
            verbose: stored on the instance; not otherwise used by this class.
            dic: path of the dictionary file to load.
            dic_encoding: text encoding of the dictionary file.
        """
        self.dic_encoding = dic_encoding
        self.dic = dic
        self.verbose = verbose
        # Per-instance containers: as class attributes they would be shared
        # (and polluted) across every segmenter in the process.
        unknown = self._UNKNOWN_FREQ
        self.word_freq = collections.defaultdict(lambda: unknown)
        self.word_incomplete = set()
        self.LoadDict(self.dic)

    def LoadDict(self, dic):
        """Merge the dictionary file *dic* into this segmenter.

        Each line holds a word and an integer frequency separated by
        whitespace; any further fields are ignored, and lines with fewer
        than two fields are skipped.

        Raises:
            IOError/OSError: if the file cannot be opened.
            ValueError: if a frequency field is not an integer.
        """
        with open(dic, "rb") as dicfile:
            for raw_line in dicfile:
                fields = raw_line.split()
                if len(fields) < 2:
                    continue
                word = fields[0].decode(self.dic_encoding)
                self.word_freq[word] = int(fields[1])
                # Record every proper prefix of length >= 2.  The upper bound
                # must be len(word): stopping at len(word) - 1 drops the
                # longest prefix, so the word itself could never be reached
                # by the extension loop in _longest_match.
                for size in range(2, len(word)):
                    self.word_incomplete.add(word[:size])

    def seg(self, text):
        """Segment *text* and return the list of Chinese words found.

        Only characters in the CJK Unified Ideographs block are segmented.
        Every other character (Latin letters, digits, whitespace,
        punctuation) ends the current run and is itself discarded.

        Args:
            text: a unicode string.

        Returns:
            A list of unicode strings, in order of appearance.
        """
        result = []
        run = []
        for ch in text:
            if self._CJK_FIRST <= ord(ch) <= self._CJK_LAST:
                run.append(ch)
            elif run:
                # Any non-CJK character is a boundary, so words are never
                # matched across embedded Latin text or punctuation.
                result.extend(self._seg_word(u''.join(run)))
                run = []
        if run:
            result.extend(self._seg_word(u''.join(run)))
        return result

    def _longest_match(self, text, start):
        """Return ``(word, freq, length)`` for the longest word at *start*.

        Falls back to the single character at *start*, with the
        unknown-word frequency, when no dictionary word begins there.
        """
        length = 2
        # Grow while the slice is still a prefix of some longer word.
        while (start + length <= len(text)
               and text[start:start + length] in self.word_incomplete):
            length += 1
        # Shrink back to the longest slice that is a real dictionary word.
        while length > 1 and text[start:start + length] not in self.word_freq:
            length -= 1
        word = text[start:start + length]
        # .get() rather than indexing: indexing the defaultdict would insert
        # every unknown character into the dictionary as a side effect.
        return word, self.word_freq.get(word, self._UNKNOWN_FREQ), len(word)

    def _seg_word(self, words):
        """Segment one uninterrupted run of CJK characters.

        Args:
            words: unicode string holding the run.

        Returns:
            A list of words whose concatenation equals *words*.
        """
        res = []
        # Trailing terminator: it can never be part of a dictionary entry,
        # so every match is guaranteed to stop before the end of the buffer.
        text = words + u' '
        end = len(text) - 1
        pos = 0
        pending = None  # candidate (word, freq, length) starting at pos
        while pos < end:
            if pending is None:
                pending = self._longest_match(text, pos)
                continue
            following = self._longest_match(text, pos + 1)
            if pending[1] > following[1] or pending[2] > following[2]:
                # The candidate beats the overlapping alternative on
                # frequency or on length: accept it whole.
                res.append(pending[0])
                pos += pending[2]
                pending = None
            else:
                # The alternative one character later looks better: emit the
                # current character alone and make the alternative the new
                # candidate.
                res.append(text[pos])
                pos += 1
                pending = following
        return res


if __name__ == '__main__':
    import sys

    wordseg = ChineseWordSegmenter(verbose=1)
    wordin = u"中国人民鞋子和服装有限公司的。apple yes(no)迄今为止,你主要是匹配整个模式,不论是匹配上,还是没有匹配上。鞋子和服装有限公司"
    wordout = wordseg.seg(wordin)
    # Emit GBK-encoded bytes as the original print statements did;
    # on Python 3 the byte stream is sys.stdout.buffer.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(wordin.encode("gbk") + b"\n")
    out.write(u" ".join(wordout).encode("gbk") + b"\n")
|
|