Lucene5学习之SpanQuery跨度查询

spanquery下的子类有好几个，我就放一篇里集中说说。spanquery即跨度查询，首先要理解跨度这个概念，lucene里跨度是用spans这个类定义的，源码如下：

/** expert: an enumeration of span matches. used to implement span searching.

* each span represents a range of term positions within a document. matches

* are enumerated in order, by increasing document number, within that by

* increasing start position and finally by increasing end position. */

public abstract class spans {

/** move to the next match, returning true iff any such exists. */

public abstract boolean next() throws ioexception;

/** skips to the first match beyond the current, whose document number is

* greater than or equal to target.

* the behavior of this method is undefined when called with

* <code> target &le; current</code>, or after the iterator has exhausted.

* both cases may result in unpredicted behavior.

* returns true iff there is such

* a match. behaves as if written: <pre class="prettyprint">

* boolean skipto(int target) {

* do {

* if (!next())

* return false;

* } while (target > doc());

* return true;

* }

* </pre>

* most implementations are considerably more efficient than that.

public abstract boolean skipto(int target) throws ioexception;

/** returns the document number of the current match. initially invalid. */

public abstract int doc();

/** returns the start position of the current match. initially invalid. */

public abstract int start();

/** returns the end position of the current match. initially invalid. */

public abstract int end();

/**

* returns the payload data for the current span.

* this is invalid until {@link #next()} is called for

* the first time.

* this method must not be called more than once after each call

* of {@link #next()}. however, most payloads are loaded lazily,

* so if the payload data for the current position is not needed,

* this method may not be called at all for performance reasons. an ordered

* spanquery does not lazy load, so if you have payloads in your index and

* you do not want ordered spannearquerys to collect payloads, you can

* disable collection with a constructor option.

*

* note that the return type is a collection, thus the ordering should not be relied upon.

*

* @lucene.experimental

* @return a list of byte arrays containing the data of this payload, otherwise null if ispayloadavailable is false

* @throws ioexception if there is a low-level i/o error

// todo: remove warning after api has been finalized

public abstract collection<byte[]> getpayload() throws ioexception;

* checks if a payload can be loaded at this position.

*

* payloads can only be loaded once per call to

* {@link #next()}.

* @return true if there is a payload available at this position that can be loaded

public abstract boolean ispayloadavailable() throws ioexception;

* returns the estimated cost of this spans.

*

* this is generally an upper bound of the number of documents this iterator

* might match, but may be a rough heuristic, hardcoded value, or otherwise

* completely inaccurate.

public abstract long cost();

}

跨度里包含了匹配term的起始位置和结束位置信息、跨度的开销估算值(cost)以及payload信息等等。

首先要说的就是spantermquery，它和termquery用法很相似，唯一区别就是spantermquery可以得到term的span跨度信息，用法如下：

package com.yida.framework.lucene5.query;

import java.io.ioexception;

import org.apache.lucene.analysis.analyzer;

import org.apache.lucene.analysis.standard.standardanalyzer;

import org.apache.lucene.document.document;

import org.apache.lucene.document.field;

import org.apache.lucene.document.textfield;

import org.apache.lucene.index.directoryreader;

import org.apache.lucene.index.indexreader;

import org.apache.lucene.index.indexwriter;

import org.apache.lucene.index.indexwriterconfig;

import org.apache.lucene.index.term;

import org.apache.lucene.index.indexwriterconfig.openmode;

import org.apache.lucene.search.automatonquery;

import org.apache.lucene.search.indexsearcher;

import org.apache.lucene.search.multitermquery;

import org.apache.lucene.search.scoredoc;

import org.apache.lucene.search.topdocs;

import org.apache.lucene.search.spans.spanquery;

import org.apache.lucene.search.spans.spantermquery;

import org.apache.lucene.store.directory;

import org.apache.lucene.store.ramdirectory;

import org.apache.lucene.util.automaton.automata;

import org.apache.lucene.util.automaton.automaton;

/**

* spantermquery用法测试

* @author lanxiaowei

public class spantermquerytest {

public static void main(string[] args) throws ioexception {

directory dir = new ramdirectory();

analyzer analyzer = new standardanalyzer();

indexwriterconfig iwc = new indexwriterconfig(analyzer);

iwc.setopenmode(openmode.create);

indexwriter writer = new indexwriter(dir, iwc);

document doc = new document();

doc.add(new textfield("text", "the quick brown fox jumps over the lazy dog", field.store.yes));

writer.adddocument(doc);

doc = new document();

doc.add(new textfield("text", "the quick red fox jumps over the sleepy cat", field.store.yes));

writer.close();

indexreader reader = directoryreader.open(dir);

indexsearcher searcher = new indexsearcher(reader);

string querystring = "red";

spanquery query = new spantermquery(new term("text",querystring));

topdocs results = searcher.search(query, null, 100);

scoredoc[] scoredocs = results.scoredocs;

for (int i = 0; i < scoredocs.length; ++i) {

//system.out.println(searcher.explain(query, scoredocs[i].doc));

int docid = scoredocs[i].doc;

document document = searcher.doc(docid);

string path = document.get("text");

system.out.println("text:" + path);

}

spannearquery：用来匹配两个term之间的跨度的，即一个term经过几个跨度可以到达另一个term,slop为跨度因子，用来限制两个term之间的最大跨度，不可能一个term和另一个term之间要经过十万八千个跨度才到达也算两者相近，这不符合常理。所以有个slop因子进行限制。还有一个inorder参数要引起注意，它用来设置是否允许进行倒序跨度，什么意思？即terma到termb不一定是从左到右去匹配也可以从右到左，而从右到左就是倒序，inorder为true即表示order(顺序)很重要不能倒序去匹配必须正向去匹配，false则反之。注意停用词不在slop统计范围内。

slop的理解很重要：

在默认情况下slop的值是0，就相当于termquery的精确匹配。通过设置slop参数可以放宽匹配条件(比如"one five"匹配"one two three four five"就需要slop=3，如果slop=2就无法得到结果)。这里我们可以认为slop是单词移动的次数，可以左移或者右移。这里特别提醒，phrasequery不保证前后单词的次序，在上面的例子中，"two one"就需要2个slop，也就是认为one向左边移动2位，就能够匹配"one two"；如果是"five three one"，就需要slop=6才能匹配。

还有一个collectpayloads参数表示是否收集payload信息，关于payload后面再单独说。

spannearquery的构造函数如下：

public spannearquery(spanquery[] clauses, int slop, boolean inorder, boolean collectpayloads) {

// copy clauses array into an arraylist

this.clauses = new arraylist<>(clauses.length);

for (int i = 0; i < clauses.length; i++) {

spanquery clause = clauses[i];

if (field == null) { // check field

field = clause.getfield();

} else if (clause.getfield() != null && !clause.getfield().equals(field)) {

throw new illegalargumentexception("clauses must have same field.");

}

this.clauses.add(clause);

this.collectpayloads = collectpayloads;

this.slop = slop;

this.inorder = inorder;

}

spannearquery使用示例：

* spannearquery测试

public class spannearquerytest {

string querystringstart = "dog";

string querystringend = "quick";

spanquery querystart = new spantermquery(new term("text",querystringstart));

spanquery queryend = new spantermquery(new term("text",querystringend));

spanquery spannearquery = new spannearquery(

new spanquery[] {querystart,queryend}, 6, false, false);

topdocs results = searcher.search(spannearquery, null, 100);

示例中dog要到达quick需要经过6个跨度，需要从右至左倒序匹配，所以inorder设置为false,如果设置为true会导致查询不出来数据。

spannotquery:使用场景是当使用spannearquery时，如果两个term从terma到termb有多种情况，即可能出现terma或者termb在索引中重复出现，则可能有多种情况，spannotquery就是用来限制terma和termb之间不存在termc,从而排除一些情况，实现更精确的控制。默认spannotquery的构造函数是这样的：

/** construct a spannotquery matching spans from <code>include</code> which

* have no overlap with spans from <code>exclude</code>.*/

public spannotquery(spanquery include, spanquery exclude) {

this(include, exclude, 0, 0);

显然这里的第一个参数include应该是spannearquery，第二个参数就是用来做排除的。

spannotquery另一个重载构造函数如下：

* have no overlap with spans from <code>exclude</code> within

* <code>dist</code> tokens of <code>include</code>. */

public spannotquery(spanquery include, spanquery exclude, int dist) {

this(include, exclude, dist, dist);

它多加了一个dist参数，官方的解释是：construct a spannotquery matching spans from <code>include</code> which have no overlap with spans from <code>exclude</code> within <code>dist</code> tokens of <code>include</code>. 说白了就是，除了include匹配到的跨度本身不能与exclude重叠之外，在该跨度前后各dist个token(词元，而不是字符)的范围内也不能出现exclude，这就是dist的作用。

spannotquery还有一个更复杂的构造函数重载：

* <code>pre</code> tokens before or <code>post</code> tokens of <code>include</code>. */

public spannotquery(spanquery include, spanquery exclude, int pre, int post) {

this.include = include;

this.exclude = exclude;

this.pre = (pre >=0) ? pre : 0;

this.post = (post >= 0) ? post : 0;

if (include.getfield() != null && exclude.getfield() != null && !include.getfield().equals(exclude.getfield()))

throw new illegalargumentexception("clauses must have same field.");

从上面的构造函数可以看出，dist其实就是把pre和post设成了同一个值：pre参数限制include跨度之前的pre个token范围内不能出现exclude，post参数限制include跨度之后的post个token范围内不能出现exclude(负数会被当作0处理)。这样解释太抽象，用示例代码来说明吧：

import org.apache.lucene.search.spans.spannearquery;

import org.apache.lucene.search.spans.spannotquery;

* spannotquery测试

public class spannotquerytest {

doc.add(new textfield("text", "the quick brown fox quick gox jumps over the lazy dog", field.store.yes));

doc.add(new textfield("text", "the quick brown adult slave nice fox winde felt testcase gox quick jumps over the lazy dog", field.store.yes));

doc.add(new textfield("text", "the quick brown fox quick jumps over the lazy dog", field.store.yes));

string excludestring = "fox";

spanquery excludequery = new spantermquery(new term("text",excludestring));

new spanquery[] {querystart,queryend}, 12, false, false);

spannotquery spannotquery = new spannotquery(spannearquery, excludequery, 4,3);

topdocs results = searcher.search(spannotquery, null, 100);

示例代码意思就是查询dog和quick之间没有fox的索引文档，自己运行示例代码参悟吧。

spanorquery顾名思义就是把多个spanquery用or连接起来，其实你也可以用booleanquery来代替spanorquery，但spanorquery会返回额外的span跨度信息，它的构造函数如下：

spanorquery(spanquery... clauses)

接收多个spanquery对象并用or连接起来，下面是spanorquery示例代码：

import org.apache.lucene.search.spans.spanorquery;

* spanorquery测试

public class spanorquerytest {

doc.add(new textfield("text", "the quick brown adult sick slave nice fox winde felt testcase fox quick jumps over the lazy dog", field.store.yes));

string termstring = "sick";

spanquery spantermquery = new spantermquery(new term("text",termstring));

spanorquery spanorquery = new spanorquery(spannotquery,spantermquery);

topdocs results = searcher.search(spanorquery, null, 100);

spanmultitermquerywrapper:就是一个query转换器，用于把multitermquery包装转换成spanquery的，具体使用示例，我贴下官方api里提供的示例代码吧：

wildcardquery wildcard = new wildcardquery(new term("field", "bro?n"));

spanquery spanwildcard = new spanmultitermquerywrapper<wildcardquery>(wildcard);

spanpositionrangequery:这个query是用来限制匹配的情况是否分布在(start,end)这个区间内，区间索引从零开始计算，拿示例代码说话，

import org.apache.lucene.search.fuzzyquery;

import org.apache.lucene.search.spans.spanmultitermquerywrapper;

import org.apache.lucene.search.spans.spanpositionrangequery;

* spanpositionrangequery测试

public class spanpositionrangequerytest {

doc.add(new textfield("text", "quick brown fox", field.store.yes));

doc.add(new textfield("text", "jumps over lazy broun dog", field.store.yes));

doc.add(new textfield("text", "jumps over extremely very lazy broxn dog", field.store.yes));

fuzzyquery fq = new fuzzyquery(new term("text", "broan"));

spanquery sfq = new spanmultitermquerywrapper<fuzzyquery>(fq);

spanpositionrangequery spanpositionrangequery = new spanpositionrangequery(sfq, 3, 5);

topdocs results = searcher.search(spanpositionrangequery, null, 100);

稍微解释下上面的代码。首先，fuzzyquery fq = new fuzzyquery(new term("text", "broan"));用来查询包含与单词broan相似的term的索引文档，三个文档里的brown、broun、broxn与broan都只差一个字符，所以都算相似。然后，我们new了一个spanquery包装器wrapper，把fuzzyquery转换成了spanquery，再使用spanpositionrangequery对匹配到的term所在的位置进行限制，即跟broan相似的单词的跨度必须落在[3,5]这个区间内(起始位置>=3且结束位置<=5，位置从零开始计算)。第1个索引文档中brown的位置是1，不在区间内，被排除；第3个索引文档中broxn的位置是5，它的跨度是[5,6)，结束位置超出了5，也被排除；第2个索引文档中broun的位置是3，跨度是[3,4)，符合条件，所以最后只返回第2个索引文档。

spanpositionrangequery还有个子类spanfirstquery,其实spanfirstquery只不过是把spanpositionrangequery构造函数里的start参数值设置为0，仅此而已，所以不用多说，你也懂的，它的构造函数如下：

spanfirstquery(spanquery match, int end)

construct a spanfirstquery matching spans in match whose end position is less than or equal to end.

这也就是为什么只有一个end,没有start,因为start默认为零，看源码：

spanfirstquery示例我就不提供了，略过。

最后一个要说的就是fieldmaskingspanquery，它用于在多个域之间查询，即把另一个域看作某个域，从而看起来就像在同一个域里查询。因为lucene默认某个条件只能作用在单个域上，不支持跨域查询，只能在同一个域里查询，所以有了fieldmaskingspanquery，下面是示例代码：

import org.apache.lucene.search.query;

import org.apache.lucene.search.spans.fieldmaskingspanquery;

* fieldmaskingspanquery测试

public class fieldmaskingspanquerytest {

doc.add(new field("teacherid", "1", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentfirstname", "james", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentsurname", "jones", field.store.yes, field.index.not_analyzed));

//teacher2

doc.add(new field("teacherid", "2", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentsurname", "smith", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentfirstname", "sally", field.store.yes, field.index.not_analyzed));

spanquery q1 = new spantermquery(new term("studentfirstname", "james"));

spanquery q2 = new spantermquery(new term("studentsurname", "jones"));

spanquery q2m = new fieldmaskingspanquery(q2, "studentfirstname");

query query = new spannearquery(new spanquery[]{q1, q2m}, -1, false);

string teacherid = document.get("teacherid");

system.out.println("teacherid:" + teacherid);

ok，spanquery就说这么多，接下来要说的就是phrasequery。

如果你还有什么问题请加我Ｑ-q：7-3-6-0-3-1-3-0-5，

或者加裙

一起交流学习！

转载：http://iamyida.iteye.com/blog/2195761

Lucene5学习之SpanQuery跨度查询

继续阅读

关于Gradle配置的小结

Java小案例——随机数猜测随机数猜测

nginx location中斜线的位置的重要性

27 Best Free Eclipse Plug-ins for Java Developer to be ProductiveCode Quality PluginsText Editor PluginsDependency ManagementVersion Control Integration PluginsFramework Development Continuous Integration Related PluginsOther Utility Plugins

Java String.format方法的简单使用

neo4j之cypher使用文档

GitHub连夜封杀！这份阿里 10W 字内部 Java 字面试手册到底有多强？

spark/scala关于【资源文件】加载方法概述外部文件加载方案测试资源文件打包入jar包中小结

mybatis_入门程序Mybatis入门

AOP编程_Android优雅权限框架(1)概念基础，2021金三银四前言正文大纲正文

Effective Java 8:通用程序设计

OOM三种类型

工厂模式-三种类型

【递归】高效率求2的n次幂

win10本地scala和spark安装安装scala安装spark

scala (3) Function 和 Method