Quantcast
Channel: CSDN博客推荐文章
Viewing all articles
Browse latest Browse all 35570

Lucene 4.5.0 之同义词实现

$
0
0
/**
 * Analyzer whose chain is: {@link StandardTokenizer} -> {@link StandardFilter}
 * -> {@link LowerCaseFilter} -> {@link StopFilter} -> {@link MyTokenFilter}.
 */
public class MyAnalyzer extends Analyzer {
	/** Stop words handed to the {@link StopFilter}; never {@code null}. */
	private final CharArraySet stopsArraySet;

	/**
	 * Creates an analyzer with an empty stop-word set, so the
	 * {@link StopFilter} built in {@link #createComponents} never receives
	 * {@code null}.
	 */
	public MyAnalyzer() {
		this(CharArraySet.EMPTY_SET);
	}

	/**
	 * Creates an analyzer that uses the given stop words.
	 *
	 * @param stopsArraySet stop words to pass to the stop filter;
	 *                      {@code null} is replaced by an empty set
	 */
	public MyAnalyzer(CharArraySet stopsArraySet) {
		super();
		this.stopsArraySet = (stopsArraySet == null) ? CharArraySet.EMPTY_SET : stopsArraySet;
	}

	/**
	 * Builds the tokenizer/filter chain for one field.
	 *
	 * @param fieldName name of the field being analyzed (not used by this chain)
	 * @param reader    source of the text to tokenize
	 * @return components whose source is the tokenizer and whose sink is the
	 *         last filter of the chain
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldName,
			Reader reader) {
		StandardTokenizer source = new StandardTokenizer(Version.LUCENE_45, reader);

		TokenStream chain = new StandardFilter(Version.LUCENE_45, source);
		chain = new LowerCaseFilter(Version.LUCENE_45, chain);
		chain = new StopFilter(Version.LUCENE_45, chain, stopsArraySet);
		// Last in the chain, so it only sees lower-cased tokens that
		// survived stop-word removal.
		chain = new MyTokenFilter(chain);

		return new TokenStreamComponents(source, chain);
	}
}

 

/**
 * Runs {@code str} through {@code analyzer} (field name {@code "content"})
 * and prints one line per token to standard output in the form
 * {@code [term]:positionIncrement startOffset endOffset type positionLength}.
 *
 * <p>An {@link IOException} raised while analyzing or closing the stream is
 * printed with {@code printStackTrace()} and not rethrown.
 *
 * @param str      text to analyze
 * @param analyzer analyzer that produces the token stream
 */
public void analysis( String str,Analyzer analyzer)
	{
		// Kept local so that a failure inside tokenStream() leaves this null
		// and the finally block cannot close a stream left over from an
		// earlier call.
		TokenStream ts=null;
		try {
			ts=analyzer.tokenStream("content", new StringReader(str));
			CharTermAttribute cta=ts.addAttribute(CharTermAttribute.class);
			PositionIncrementAttribute pia=ts.addAttribute(PositionIncrementAttribute.class);
			OffsetAttribute oa=ts.addAttribute(OffsetAttribute.class);
			TypeAttribute ta=ts.addAttribute(TypeAttribute.class);
			PositionLengthAttribute positionLengthAttribute=ts.addAttribute(PositionLengthAttribute.class);

			// Stream is consumed in the order reset -> incrementToken* -> end -> close.
			ts.reset();

			while(ts.incrementToken())
			{
				System.out.println("["+cta+"]:"+pia.getPositionIncrement()+" "
						+oa.startOffset()+" "+oa.endOffset()+" "+ta.type()+" "+
						positionLengthAttribute.getPositionLength());
			}
			ts.end();

		} catch (IOException e) {
			e.printStackTrace();
		}finally{
			if(ts!=null)
			{
				try {
					ts.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}


 

/**
 * Token filter that injects hard-coded synonyms into the stream.
 *
 * <p>When a token's term is a key of the synonym table, the token is passed
 * through unchanged and each of its synonyms is then emitted as an extra
 * token with position increment 0. The extra tokens are produced by restoring
 * the state captured for the original token and replacing only the term text.
 * Synonyms come out in reverse table order because they are kept on a stack.
 */
public final class MyTokenFilter extends TokenFilter {

	/** Synonym table (term -> synonyms), built once instead of once per token. */
	private static final Map<String,String[]> SYNONYMS=new HashMap<String, String[]>();
	static {
		SYNONYMS.put("zhongguo",new String[]{"dalu","tianchao"});
		SYNONYMS.put("wo", new String[]{"zan","me"});
	}

	private final CharTermAttribute charTermAttribute;
	private final PositionIncrementAttribute positionIncrementAttribute;
	/** State of the most recent token that had synonyms; restored for each synonym. */
	private State currentstate;
	/** Synonyms still to be emitted for the most recent matching token. */
	private final Stack<String> sames=new Stack<String>();

	/**
	 * @param input upstream token stream to read tokens from
	 */
	protected MyTokenFilter(TokenStream input) {
		super(input);
		charTermAttribute=addAttribute(CharTermAttribute.class);
		positionIncrementAttribute=addAttribute(PositionIncrementAttribute.class);
	}

	/**
	 * Emits a pending synonym if one is queued, otherwise advances the
	 * upstream stream and queues the synonyms of the new token.
	 *
	 * @return {@code true} if a token was produced, {@code false} once the
	 *         upstream stream is exhausted and no synonym is pending
	 * @throws IOException if the upstream stream fails
	 */
	@Override
	public boolean incrementToken() throws IOException {
		if(!sames.isEmpty())
		{
			// Start from the original token's attributes, then overwrite
			// only the term text and the position increment.
			restoreState(currentstate);
			charTermAttribute.setEmpty();
			charTermAttribute.append(sames.pop());
			positionIncrementAttribute.setPositionIncrement(0);
			return true;
		}

		if(!input.incrementToken())
			return false;

		getsamewords(charTermAttribute.toString());
		return true;
	}

	/**
	 * Clears pending synonyms and the captured state so that nothing left
	 * over from a previous, partly consumed stream is emitted after reset.
	 *
	 * @throws IOException if resetting the upstream stream fails
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		sames.clear();
		currentstate=null;
	}

	/**
	 * Queues the synonyms of {@code name}, if any, and captures the current
	 * attribute state for them. Does nothing when {@code name} has no entry
	 * in the synonym table.
	 *
	 * @param name term text of the token just read
	 */
	public void getsamewords(String name)
	{
		String[] searchStrings=SYNONYMS.get(name);
		if(searchStrings==null)
			return;
		for(int i=0;i<searchStrings.length;i++)
		{
			sames.push(searchStrings[i]);
		}
		currentstate=captureState();
	}
}

 

测试:

/**
 * Demo driver: builds a case-insensitive stop-word set, wraps it in a
 * {@link MyAnalyzer} and prints the tokens of a sample sentence through
 * {@code AnalyzerUtil.analysis}.
 */
public void TestStandAnalyzer()
	{
		AnalyzerUtil auAnalyzerUtil=new AnalyzerUtil();

		LinkedList<String> stopList=new LinkedList<String>();
		stopList.add("this");
		stopList.add("is");
		stopList.add("a");
		stopList.add("that");
		stopList.add("like");

		// One case-insensitive set is enough; the original copied it into a
		// second CharArraySet before passing it to the analyzer.
		CharArraySet stopwordSet=new CharArraySet(Version.LUCENE_45,stopList, true);

		// Literals are kept exactly as published, since token offsets depend on them.
		String text="This is a dog. That is a cat. wo like football but wo don't like " +
				"basketball. " +"zhongguo is greate  country.";

		auAnalyzerUtil.analysis(text, new MyAnalyzer(stopwordSet));
	}


输出:

[dog]:4 10 13 <ALPHANUM> 1
[cat]:4 25 28 <ALPHANUM> 1
[wo]:1 30 32 <ALPHANUM> 1
[me]:0 30 32 <ALPHANUM> 1
[zan]:0 30 32 <ALPHANUM> 1
[football]:2 38 46 <ALPHANUM> 1
[but]:1 47 50 <ALPHANUM> 1
[wo]:1 51 53 <ALPHANUM> 1
[me]:0 51 53 <ALPHANUM> 1
[zan]:0 51 53 <ALPHANUM> 1
[don't]:1 54 59 <ALPHANUM> 1
[basketball]:2 65 75 <ALPHANUM> 1
[zhongguo]:1 77 85 <ALPHANUM> 1
[tianchao]:0 77 85 <ALPHANUM> 1
[dalu]:0 77 85 <ALPHANUM> 1
[greate]:2 89 95 <ALPHANUM> 1
[country]:1 97 104 <ALPHANUM> 1


 

作者:keep_moving_cqu 发表于2013-12-29 1:40:36 原文链接
阅读:138 评论:0 查看评论

Viewing all articles
Browse latest Browse all 35570

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>