public class MyAnalyzer extends Analyzer { private CharArraySet stopsArraySet=null; public MyAnalyzer() { super(); // TODO Auto-generated constructor stub } public MyAnalyzer(CharArraySet stopsArraySet) { super(); this.stopsArraySet = stopsArraySet; } @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { StandardTokenizer srcStandardTokenizer=new StandardTokenizer(Version.LUCENE_45, reader); TokenStream tokenStream=new StandardFilter(Version.LUCENE_45, srcStandardTokenizer); tokenStream=new LowerCaseFilter(Version.LUCENE_45, tokenStream); tokenStream=new StopFilter(Version.LUCENE_45, tokenStream,stopsArraySet); tokenStream=new MyTokenFilter(tokenStream); return new TokenStreamComponents(srcStandardTokenizer, tokenStream); } }
/**
 * Tokenizes {@code str} with {@code analyzer} under the field name "content"
 * and prints one line per token to standard output in the form
 * {@code [term]:positionIncrement startOffset endOffset type positionLength}.
 *
 * I/O failures are reported with {@code printStackTrace()} and not rethrown.
 *
 * NOTE(review): {@code ts} is not declared in this method; presumably it is a
 * field of the enclosing class. Confirm against the class definition.
 *
 * @param str      text to analyze
 * @param analyzer analyzer that produces the token stream
 */
public void analysis(String str, Analyzer analyzer) {
    try {
        ts = analyzer.tokenStream("content", new StringReader(str));

        // Register the attributes that are read for every token.
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncrement = ts.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsets = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute tokenType = ts.addAttribute(TypeAttribute.class);
        PositionLengthAttribute posLength = ts.addAttribute(PositionLengthAttribute.class);

        ts.reset();
        while (true) {
            if (!ts.incrementToken()) {
                break;
            }
            String label = "[" + term + "]:";
            String position = posIncrement.getPositionIncrement() + " ";
            String span = offsets.startOffset() + " " + offsets.endOffset() + " ";
            String tail = tokenType.type() + " " + posLength.getPositionLength();
            System.out.println(label + position + span + tail);
        }
        ts.end();
    } catch (IOException analysisFailure) {
        analysisFailure.printStackTrace();
    } finally {
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
public final class MyTokenFilter extends TokenFilter { private CharTermAttribute charTermAttribute; private PositionIncrementAttribute positionIncrementAttribute; private State currentstate; private Stack<String> sames=new Stack<String>(); protected MyTokenFilter(TokenStream input) { super(input); charTermAttribute=input.addAttribute(CharTermAttribute.class); positionIncrementAttribute=input.addAttribute(PositionIncrementAttribute.class); // TODO Auto-generated constructor stub } @Override public boolean incrementToken() throws IOException { // TODO Auto-generated method stub if(sames.size()>0) { restoreState(currentstate); charTermAttribute.setEmpty(); String top=sames.pop(); charTermAttribute.append(top); positionIncrementAttribute.setPositionIncrement(0); return true; } if(!input.incrementToken()) return false; getsamewords(charTermAttribute.toString()); return true; } public void getsamewords(String name) { Map<String,String[]> maps=new HashMap<String, String[]>(); maps.put("zhongguo",new String[]{"dalu","tianchao"}); maps.put("wo", new String[]{"zan","me"}); String[] searchStrings=maps.get(name); if(searchStrings==null) return; for(int i=0;i<maps.get(name).length;i++) { sames.push(maps.get(name)[i]); } currentstate=captureState(); } }
测试:
/**
 * Demo driver: builds a stop-word set from "this", "is", "a", "that" and
 * "like", wraps it in a MyAnalyzer, and passes a fixed sample sentence to
 * AnalyzerUtil.analysis.
 */
public void TestStandAnalyzer() {
    AnalyzerUtil util = new AnalyzerUtil();

    LinkedList<String> stopWords = new LinkedList<String>();
    for (String word : new String[] {"this", "is", "a", "that", "like"}) {
        stopWords.add(word);
    }

    CharArraySet stopwordSet = new CharArraySet(Version.LUCENE_45, stopWords, true);
    MyAnalyzer analyzer = new MyAnalyzer(new CharArraySet(Version.LUCENE_45, stopwordSet, true));

    String sample = "This is a dog. That is a cat. wo like football but wo don't like "
            + "basketball. "
            + "zhongguo is greate country.";
    util.analysis(sample, analyzer);
}
输出:
[dog]:4 10 13 <ALPHANUM> 1 [cat]:4 25 28 <ALPHANUM> 1 [wo]:1 30 32 <ALPHANUM> 1 [me]:0 30 32 <ALPHANUM> 1 [zan]:0 30 32 <ALPHANUM> 1 [football]:2 38 46 <ALPHANUM> 1 [but]:1 47 50 <ALPHANUM> 1 [wo]:1 51 53 <ALPHANUM> 1 [me]:0 51 53 <ALPHANUM> 1 [zan]:0 51 53 <ALPHANUM> 1 [don't]:1 54 59 <ALPHANUM> 1 [basketball]:2 65 75 <ALPHANUM> 1 [zhongguo]:1 77 85 <ALPHANUM> 1 [tianchao]:0 77 85 <ALPHANUM> 1 [dalu]:0 77 85 <ALPHANUM> 1 [greate]:2 89 95 <ALPHANUM> 1 [country]:1 97 104 <ALPHANUM> 1
作者:keep_moving_cqu 发表于2013-12-29 1:40:36 原文链接
阅读:138 评论:0 查看评论