  • 浏览: 566262 次
  • 性别: Icon_minigender_1
  • 来自: 北京





/** Expert: an enumeration of span matches.  Used to implement span searching.
 * Each span represents a range of term positions within a document.  Matches
 * are enumerated in order, by increasing document number, within that by
 * increasing start position and finally by increasing end position. */
public abstract class Spans {
  /** Move to the next match, returning true iff any such exists. */
  public abstract boolean next() throws IOException;

  /** Skips to the first match beyond the current, whose document number is
   * greater than or equal to <i>target</i>.
   * <p>The behavior of this method is <b>undefined</b> when called with
   * <code> target &le; current</code>, or after the iterator has exhausted.
   * Both cases may result in unpredicted behavior.
   * <p>Returns true iff there is such
   * a match.  <p>Behaves as if written: <pre class="prettyprint">
   *   boolean skipTo(int target) {
   *     do {
   *       if (!next())
   *         return false;
   *     } while (target > doc());
   *     return true;
   *   }
   * </pre>
   * Most implementations are considerably more efficient than that.
   */
  public abstract boolean skipTo(int target) throws IOException;

  /** Returns the document number of the current match.  Initially invalid. */
  public abstract int doc();

  /** Returns the start position of the current match.  Initially invalid. */
  public abstract int start();

  /** Returns the end position of the current match.  Initially invalid. */
  public abstract int end();

  /**
   * Returns the payload data for the current span.
   * This is invalid until {@link #next()} is called for
   * the first time.
   * This method must not be called more than once after each call
   * of {@link #next()}. However, most payloads are loaded lazily,
   * so if the payload data for the current position is not needed,
   * this method may not be called at all for performance reasons. An ordered
   * SpanQuery does not lazy load, so if you have payloads in your index and
   * you do not want ordered SpanNearQuerys to collect payloads, you can
   * disable collection with a constructor option.<br>
   * <br>
   * Note that the return type is a collection, thus the ordering should not be relied upon.
   * <br/>
   * @lucene.experimental
   *
   * @return a List of byte arrays containing the data of this payload, otherwise null if isPayloadAvailable is false
   * @throws IOException if there is a low-level I/O error
   */
  // TODO: Remove warning after API has been finalized
  public abstract Collection<byte[]> getPayload() throws IOException;

  /**
   * Checks if a payload can be loaded at this position.
   * <p/>
   * Payloads can only be loaded once per call to
   * {@link #next()}.
   *
   * @return true if there is a payload available at this position that can be loaded
   * @throws IOException if there is a low-level I/O error
   */
  public abstract boolean isPayloadAvailable() throws IOException;

  /**
   * Returns the estimated cost of this spans.
   * <p>
   * This is generally an upper bound of the number of documents this iterator
   * might match, but may be a rough heuristic, hardcoded value, or otherwise
   * completely inaccurate.
   */
  public abstract long cost();
}





package com.yida.framework.lucene5.query;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
 * SpanTermQuery用法测试
 * @author Lanxiaowei
public class SpanTermQueryTest {
	public static void main(String[] args) throws IOException {
		Directory dir = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("text", "the quick brown fox jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick red fox jumps over the sleepy cat", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown fox jumps over the lazy dog", Field.Store.YES));

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        String queryString = "red";
        SpanQuery query = new SpanTermQuery(new Term("text",queryString));
        TopDocs results = searcher.search(query, null, 100);
        ScoreDoc[] scoreDocs = results.scoreDocs;
        for (int i = 0; i < scoreDocs.length; ++i) {
            //System.out.println(searcher.explain(query, scoreDocs[i].doc));
        	int docID = scoreDocs[i].doc;
			Document document = searcher.doc(docID);
			String path = document.get("text");
			System.out.println("text:" + path);



    在默认情况下slop的值是0,就相当于TermQuery的精确匹配。通过设置slop参数可以放宽匹配条件,比如"one five"匹配"one two three four five"就需要slop=3,如果slop=2就无法得到结果。这里我们可以认为slop是单词移动的次数,可以左移或者右移。这里特别提醒,PhraseQuery不保证前后单词的次序,在上面的例子中,"two one"就需要2个slop,也就是认为one向左边移动2位,就能够匹配"one two";如果是"five three one",就需要slop=6才能匹配。






/**
 * Construct a SpanNearQuery over the given clauses.
 *
 * @param clauses the span clauses to match; all clauses that report a field must report the same one
 * @param slop the slop value stored on this query
 * @param inOrder the in-order flag stored on this query
 * @param collectPayloads the payload-collection flag stored on this query
 * @throws IllegalArgumentException if two clauses report different fields
 */
public SpanNearQuery(SpanQuery[] clauses, int slop, boolean inOrder, boolean collectPayloads) {

    // copy clauses array into an ArrayList
    this.clauses = new ArrayList<>(clauses.length);
    for (SpanQuery clause : clauses) {
      if (field == null) {                               // check field
        field = clause.getField();
      } else if (clause.getField() != null && !clause.getField().equals(field)) {
        throw new IllegalArgumentException("Clauses must have same field.");
      }
      // Without this the freshly allocated list would stay empty.
      this.clauses.add(clause);
    }
    this.collectPayloads = collectPayloads;
    this.slop = slop;
    this.inOrder = inOrder;
}




 * SpanNearQuery测试
 * @author Lanxiaowei
public class SpanNearQueryTest {
	public static void main(String[] args) throws IOException {
		Directory dir = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("text", "the quick brown fox jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick red fox jumps over the sleepy cat", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown fox jumps over the lazy dog", Field.Store.YES));

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        String queryStringStart = "dog";
        String queryStringEnd = "quick";
        SpanQuery queryStart = new SpanTermQuery(new Term("text",queryStringStart));
        SpanQuery queryEnd = new SpanTermQuery(new Term("text",queryStringEnd));
        SpanQuery spanNearQuery = new SpanNearQuery(
            new SpanQuery[] {queryStart,queryEnd}, 6, false, false);
        TopDocs results = searcher.search(spanNearQuery, null, 100);
        ScoreDoc[] scoreDocs = results.scoreDocs;
        for (int i = 0; i < scoreDocs.length; ++i) {
            //System.out.println(searcher.explain(query, scoreDocs[i].doc));
        	int docID = scoreDocs[i].doc;
			Document document = searcher.doc(docID);
			String path = document.get("text");
			System.out.println("text:" + path);





/** Construct a SpanNotQuery matching spans from <code>include</code> which
   * have no overlap with spans from <code>exclude</code>.
   *
   * @param include the spans to match
   * @param exclude the spans that must not overlap a match
   */
  public SpanNotQuery(SpanQuery include, SpanQuery exclude) {
     // Delegates with zero tokens of padding before and after the include span.
     this(include, exclude, 0, 0);
  }





/** Construct a SpanNotQuery matching spans from <code>include</code> which
   * have no overlap with spans from <code>exclude</code> within
   * <code>dist</code> tokens of <code>include</code>.
   *
   * @param include the spans to match
   * @param exclude the spans that must not occur near a match
   * @param dist number of tokens, applied both before and after <code>include</code>
   */
  public SpanNotQuery(SpanQuery include, SpanQuery exclude, int dist) {
     // The same distance is used for both the "pre" and the "post" side.
     this(include, exclude, dist, dist);
  }

 它多加了一个dist参数,官方的解释是:Construct a SpanNotQuery matching spans from include which have no overlap with spans from exclude within dist tokens of include. 说白了就是:在include匹配到的跨度前后各dist个token(是词元,不是字符)的范围内,也不允许出现exclude匹配到的跨度,这就是dist的作用。




/** Construct a SpanNotQuery matching spans from <code>include</code> which
   * have no overlap with spans from <code>exclude</code> within
   * <code>pre</code> tokens before or <code>post</code> tokens after <code>include</code>.
   *
   * @param include the spans to match
   * @param exclude the spans that must not occur near a match
   * @param pre number of tokens before <code>include</code>; negative values are treated as 0
   * @param post number of tokens after <code>include</code>; negative values are treated as 0
   * @throws IllegalArgumentException if both queries report a field and the fields differ
   */
  public SpanNotQuery(SpanQuery include, SpanQuery exclude, int pre, int post) {
    this.include = include;
    this.exclude = exclude;
    // Negative distances are clamped to zero rather than rejected.
    this.pre = (pre >= 0) ? pre : 0;
    this.post = (post >= 0) ? post : 0;

    if (include.getField() != null && exclude.getField() != null && !include.getField().equals(exclude.getField())) {
      throw new IllegalArgumentException("Clauses must have same field.");
    }
  }

 pre和post其实就是把dist拆成了前后两个方向:pre限制include跨度之前多少个token内不能出现exclude,post限制include跨度之后多少个token内不能出现exclude(三个参数的构造函数相当于pre=post=dist)。这样解释太抽象,用示例代码来说明吧:



package com.yida.framework.lucene5.query;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

 * SpanNotQuery测试
 * @author Lanxiaowei
public class SpanNotQueryTest {
	public static void main(String[] args) throws IOException {
		Directory dir = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("text", "the quick brown fox jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick red fox jumps over the sleepy cat", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown fox quick gox jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown adult slave nice fox winde felt testcase gox quick jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown fox quick jumps over the lazy dog", Field.Store.YES));

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        String queryStringStart = "dog";
        String queryStringEnd = "quick";
        String excludeString = "fox";
        SpanQuery queryStart = new SpanTermQuery(new Term("text",queryStringStart));
        SpanQuery queryEnd = new SpanTermQuery(new Term("text",queryStringEnd));
        SpanQuery excludeQuery = new SpanTermQuery(new Term("text",excludeString));
        SpanQuery spanNearQuery = new SpanNearQuery(
            new SpanQuery[] {queryStart,queryEnd}, 12, false, false);
        SpanNotQuery spanNotQuery = new SpanNotQuery(spanNearQuery, excludeQuery, 4,3);
        TopDocs results = searcher.search(spanNotQuery, null, 100);
        ScoreDoc[] scoreDocs = results.scoreDocs;
        for (int i = 0; i < scoreDocs.length; ++i) {
            //System.out.println(searcher.explain(query, scoreDocs[i].doc));
        	int docID = scoreDocs[i].doc;
			Document document = searcher.doc(docID);
			String path = document.get("text");
			System.out.println("text:" + path);





SpanOrQuery(SpanQuery... clauses) 




package com.yida.framework.lucene5.query;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

 * SpanOrQuery测试
 * @author Lanxiaowei
public class SpanOrQueryTest {
	public static void main(String[] args) throws IOException {
		Directory dir = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("text", "the quick brown fox jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick red fox jumps over the sleepy cat", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown fox quick gox jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown adult slave nice fox winde felt testcase gox quick jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown adult sick slave nice fox winde felt testcase fox quick jumps over the lazy dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "the quick brown fox quick jumps over the lazy dog", Field.Store.YES));

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        String queryStringStart = "dog";
        String queryStringEnd = "quick";
        String excludeString = "fox";
        String termString = "sick";
        SpanQuery queryStart = new SpanTermQuery(new Term("text",queryStringStart));
        SpanQuery queryEnd = new SpanTermQuery(new Term("text",queryStringEnd));
        SpanQuery excludeQuery = new SpanTermQuery(new Term("text",excludeString));
        SpanQuery spanNearQuery = new SpanNearQuery(
            new SpanQuery[] {queryStart,queryEnd}, 12, false, false);
        SpanNotQuery spanNotQuery = new SpanNotQuery(spanNearQuery, excludeQuery, 4,3);
        SpanQuery spanTermQuery = new SpanTermQuery(new Term("text",termString));
        SpanOrQuery spanOrQuery = new SpanOrQuery(spanNotQuery,spanTermQuery);
        TopDocs results = searcher.search(spanOrQuery, null, 100);
        ScoreDoc[] scoreDocs = results.scoreDocs;
        for (int i = 0; i < scoreDocs.length; ++i) {
            //System.out.println(searcher.explain(query, scoreDocs[i].doc));
        	int docID = scoreDocs[i].doc;
			Document document = searcher.doc(docID);
			String path = document.get("text");
			System.out.println("text:" + path);




// Wildcard query on the "field" field; in WildcardQuery syntax '?' stands for a single character.
WildcardQuery wildcard = new WildcardQuery(new Term("field", "bro?n"));
 // Wrap the wildcard query so it can be used wherever a SpanQuery is expected.
 SpanQuery spanWildcard = new SpanMultiTermQueryWrapper<WildcardQuery>(wildcard);




package com.yida.framework.lucene5.query;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanPositionRangeQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

 * SpanPositionRangeQuery测试
 * @author Lanxiaowei
public class SpanPositionRangeQueryTest {
	public static void main(String[] args) throws IOException {
		Directory dir = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("text", "quick brown fox", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "jumps over lazy broun dog", Field.Store.YES));
        doc = new Document();
        doc.add(new TextField("text", "jumps over extremely very lazy broxn dog", Field.Store.YES));

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        FuzzyQuery fq = new FuzzyQuery(new Term("text", "broan"));
        SpanQuery sfq = new SpanMultiTermQueryWrapper<FuzzyQuery>(fq);
        SpanPositionRangeQuery spanPositionRangeQuery = new SpanPositionRangeQuery(sfq, 3, 5);
        TopDocs results = searcher.search(spanPositionRangeQuery, null, 100);
        ScoreDoc[] scoreDocs = results.scoreDocs;
        for (int i = 0; i < scoreDocs.length; ++i) {
            //System.out.println(searcher.explain(query, scoreDocs[i].doc));
        	int docID = scoreDocs[i].doc;
			Document document = searcher.doc(docID);
			String path = document.get("text");
			System.out.println("text:" + path);

 稍微解释下上面的代码:首先,FuzzyQuery fq = new FuzzyQuery(new Term("text", "broan"));用来查询包含跟单词broan相似的词的索引文档;然后我们new了一个SpanQuery包装器SpanMultiTermQueryWrapper,把FuzzyQuery转换成了SpanQuery;最后使用SpanPositionRangeQuery对匹配到的词出现的位置进行限制,即跟broan相似的单词必须落在(3,5)这个位置区间内。第3个索引文档中相似单词broxn的位置超出了这个区间,所以被排除了,最后只返回第2个索引文档。



SpanFirstQuery(SpanQuery match, int end) 
Construct a SpanFirstQuery matching spans in match whose end position is less than or equal to end.




package com.yida.framework.lucene5.query;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

 * FieldMaskingSpanQuery测试
 * @author Lanxiaowei
public class FieldMaskingSpanQueryTest {
	public static void main(String[] args) throws IOException {
		Directory dir = new RAMDirectory();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();

        doc.add(new Field("teacherid", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));

        doc.add(new Field("studentfirstname", "james", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("studentsurname", "jones", Field.Store.YES, Field.Index.NOT_ANALYZED));

        doc = new Document();

        doc.add(new Field("teacherid", "2", Field.Store.YES, Field.Index.NOT_ANALYZED));

        doc.add(new Field("studentfirstname", "james", Field.Store.YES, Field.Index.NOT_ANALYZED));

        doc.add(new Field("studentsurname", "smith", Field.Store.YES, Field.Index.NOT_ANALYZED));

        doc.add(new Field("studentfirstname", "sally", Field.Store.YES, Field.Index.NOT_ANALYZED));

        doc.add(new Field("studentsurname", "jones", Field.Store.YES, Field.Index.NOT_ANALYZED));


        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        SpanQuery q1  = new SpanTermQuery(new Term("studentfirstname", "james"));
        SpanQuery q2  = new SpanTermQuery(new Term("studentsurname", "jones"));
        SpanQuery q2m = new FieldMaskingSpanQuery(q2, "studentfirstname");

        Query query = new SpanNearQuery(new SpanQuery[]{q1, q2m}, -1, false);
        TopDocs results = searcher.search(query, null, 100);
        ScoreDoc[] scoreDocs = results.scoreDocs;
        for (int i = 0; i < scoreDocs.length; ++i) {
            //System.out.println(searcher.explain(query, scoreDocs[i].doc));
        	int docID = scoreDocs[i].doc;
			Document document = searcher.doc(docID);
			String teacherid = document.get("teacherid");
			System.out.println("teacherid:" + teacherid);





  • 大小: 251.9 KB
1 楼 oaibf 2015-05-27  


Global site tag (gtag.js) - Google Analytics