使用不同的分词器,最终得到的关键词不同,所需的时间也不同
用中文分词器是个不错的选择,但是对比耗时,在我的电脑上分词大约需要800+ms
分词器工作流程:
输入文本(What's your name?)
→关键词切分(What's ; your ; name),不同的分词器采用的切分方法不同
→删除停用词()
→词形还原(What's -> What)
→转换为小写(What -> what)
/** Timestamp in milliseconds, set from System.currentTimeMillis() in the @Before method s(). */
private long stime;
/** Timestamp in milliseconds, set from System.currentTimeMillis() in the @After method e(). */
private long etime;
/** Analyzer under test; assigned in test() and read in e() to report its class name. */
private Analyzer analyzer;
/**
 * Records the start time before each test so that the @After hook can
 * report how long the test took.
 */
@Before
public void s() {
    this.stime = System.currentTimeMillis();
}
/**
 * Records the end time after each test and prints the analyzer's class name
 * together with the elapsed time in milliseconds.
 */
@After
public void e() {
    this.etime = System.currentTimeMillis();
    String analyzerName = analyzer.getClass().getName();
    long elapsedMs = etime - stime;
    System.out.println("使用" + analyzerName + "分词, 耗时" + elapsedMs + "ms");
}
/**
 * Tokenizes a mixed URL/Chinese sample string with the selected analyzer.
 * Swap in one of the commented-out analyzers below to compare output and timing.
 *
 * @throws Exception if tokenizing fails
 */
@Test
public void test() throws Exception {
    final String sample = "hTTp://www.baidu.com/s?wd=Lucene中文分词";
    // Alternative analyzers:
    //analyzer = new SimpleAnalyzer(Version.LUCENE_35);
    //analyzer = new StandardAnalyzer(Version.LUCENE_35);
    analyzer = new IKAnalyzer();
    analyze(analyzer, sample);
}
/**
 * Tokenizes {@code text} with {@code analyzer} and prints one line per token
 * in the form {@code term, startOffset, endOffset}.
 *
 * @param analyzer the analyzer that produces the token stream
 * @param text     the text to tokenize
 * @throws Exception if creating or consuming the token stream fails
 */
private void analyze(Analyzer analyzer, String text) throws Exception {
    TokenStream tokens = analyzer.reusableTokenStream("content", new StringReader(text));
    try {
        // addAttribute returns the existing attribute instance or registers one,
        // whereas getAttribute throws if the analysis chain does not provide it.
        OffsetAttribute offsetAttr = tokens.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttr = tokens.addAttribute(CharTermAttribute.class);

        // Consumer workflow: reset() -> incrementToken()* -> end() -> close().
        tokens.reset();
        while (tokens.incrementToken()) {
            // The term length must come from the term attribute itself. The offsets
            // describe the span in the ORIGINAL text, which can differ from the term
            // length once a filter has altered the term, and the buffer may contain
            // stale characters past length().
            String term = new String(charTermAttr.buffer(), 0, charTermAttr.length());
            System.out.println(term + ", " + offsetAttr.startOffset() + ", " + offsetAttr.endOffset());
        }
        tokens.end();
    } finally {
        // Release the stream even when tokenizing throws.
        tokens.close();
    }
    // Note: the older TermAttribute / term() API is deprecated; CharTermAttribute replaces it.
}
发表评论