Lucene5學習之SpanQuery跨度查詢

spanquery下的子類有好幾個，我就放一篇裡集中說說。spanquery即跨度查詢，首先要了解跨度這個概念，lucene裡跨度是用spans這個類定義的，源碼如下：

/** expert: an enumeration of span matches. used to implement span searching.

* each span represents a range of term positions within a document. matches

* are enumerated in order, by increasing document number, within that by

* increasing start position and finally by increasing end position. */

public abstract class spans {

/** move to the next match, returning true iff any such exists. */

public abstract boolean next() throws ioexception;

/** skips to the first match beyond the current, whose document number is

* greater than or equal to target.

* the behavior of this method is undefined when called with

* <code> target &le; current</code>, or after the iterator has exhausted.

* both cases may result in unpredicted behavior.

* returns true iff there is such

* a match. behaves as if written: <pre class="prettyprint">

* boolean skipto(int target) {

* do {

* if (!next())

* return false;

* } while (target > doc());

* return true;

* }

* </pre>

* most implementations are considerably more efficient than that.

public abstract boolean skipto(int target) throws ioexception;

/** returns the document number of the current match. initially invalid. */

public abstract int doc();

/** returns the start position of the current match. initially invalid. */

public abstract int start();

/** returns the end position of the current match. initially invalid. */

public abstract int end();

/**

* returns the payload data for the current span.

* this is invalid until {@link #next()} is called for

* the first time.

* this method must not be called more than once after each call

* of {@link #next()}. however, most payloads are loaded lazily,

* so if the payload data for the current position is not needed,

* this method may not be called at all for performance reasons. an ordered

* spanquery does not lazy load, so if you have payloads in your index and

* you do not want ordered spannearquerys to collect payloads, you can

* disable collection with a constructor option.

*

* note that the return type is a collection, thus the ordering should not be relied upon.

*

* @lucene.experimental

* @return a list of byte arrays containing the data of this payload, otherwise null if ispayloadavailable is false

* @throws ioexception if there is a low-level i/o error

// todo: remove warning after api has been finalized

public abstract collection<byte[]> getpayload() throws ioexception;

* checks if a payload can be loaded at this position.

*

* payloads can only be loaded once per call to

* {@link #next()}.

* @return true if there is a payload available at this position that can be loaded

public abstract boolean ispayloadavailable() throws ioexception;

* returns the estimated cost of this spans.

*

* this is generally an upper bound of the number of documents this iterator

* might match, but may be a rough heuristic, hardcoded value, or otherwise

* completely inaccurate.

public abstract long cost();

}

跨度裡包含了比對term的起始位置和結束位置資訊以及跨度價值估算值以及payload資訊等等。

首先要說的就是spantermquery，它和termquery用法很相似，唯一差別就是spantermquery可以得到term的span跨度資訊，用法如下：

package com.yida.framework.lucene5.query;

import java.io.ioexception;

import org.apache.lucene.analysis.analyzer;

import org.apache.lucene.analysis.standard.standardanalyzer;

import org.apache.lucene.document.document;

import org.apache.lucene.document.field;

import org.apache.lucene.document.textfield;

import org.apache.lucene.index.directoryreader;

import org.apache.lucene.index.indexreader;

import org.apache.lucene.index.indexwriter;

import org.apache.lucene.index.indexwriterconfig;

import org.apache.lucene.index.term;

import org.apache.lucene.index.indexwriterconfig.openmode;

import org.apache.lucene.search.automatonquery;

import org.apache.lucene.search.indexsearcher;

import org.apache.lucene.search.multitermquery;

import org.apache.lucene.search.scoredoc;

import org.apache.lucene.search.topdocs;

import org.apache.lucene.search.spans.spanquery;

import org.apache.lucene.search.spans.spantermquery;

import org.apache.lucene.store.directory;

import org.apache.lucene.store.ramdirectory;

import org.apache.lucene.util.automaton.automata;

import org.apache.lucene.util.automaton.automaton;

/**

* spantermquery用法測試

* @author lanxiaowei

public class spantermquerytest {

public static void main(string[] args) throws ioexception {

directory dir = new ramdirectory();

analyzer analyzer = new standardanalyzer();

indexwriterconfig iwc = new indexwriterconfig(analyzer);

iwc.setopenmode(openmode.create);

indexwriter writer = new indexwriter(dir, iwc);

document doc = new document();

doc.add(new textfield("text", "the quick brown fox jumps over the lazy dog", field.store.yes));

writer.adddocument(doc);

doc = new document();

doc.add(new textfield("text", "the quick red fox jumps over the sleepy cat", field.store.yes));

writer.close();

indexreader reader = directoryreader.open(dir);

indexsearcher searcher = new indexsearcher(reader);

string querystring = "red";

spanquery query = new spantermquery(new term("text",querystring));

topdocs results = searcher.search(query, null, 100);

scoredoc[] scoredocs = results.scoredocs;

for (int i = 0; i < scoredocs.length; ++i) {

//system.out.println(searcher.explain(query, scoredocs[i].doc));

int docid = scoredocs[i].doc;

document document = searcher.doc(docid);

string path = document.get("text");

system.out.println("text:" + path);

}

spannearquery：用來比對兩個term之間的跨度的，即一個term經過幾個跨度可以到達另一個term,slop為跨度因子，用來限制兩個term之間的最大跨度，不可能一個term和另一個term之間要經過十萬八千個跨度才到達也算兩者相近，這不符合常理。是以有個slop因子進行限制。還有一個inorder參數要引起注意，它用來設定是否允許進行倒序跨度，什麼意思？即terma到termb不一定是從左到右去比對也可以從右到左，而從右到左就是倒序，inorder為true即表示order(順序)很重要不能倒序去比對必須正向去比對，false則反之。注意停用詞不在slop統計範圍内。

slop的了解很重要：

在預設情況下slop的值是0, 就相當于termquery的精确比對, 通過設定slop參數可以放寬比對條件(比如"one five"比對"one two three four five"就需要slop=3,如果slop=2就無法得到結果)。這裡我們可以認為slop是單詞移動的次數，可以左移或者右移。這裡特别提醒,phrasequery不保證前後單詞的次序,在上面的例子中,"two one"就需要2個slop,也就是認為one向左邊移動2位,就能夠比對"one two"；如果是"five three one"就需要slop=6才能比對。

還有一個collectpayloads參數表示是否收集payload資訊，關于payload後面再單獨說。

spannearquery的構造函數如下：

public spannearquery(spanquery[] clauses, int slop, boolean inorder, boolean collectpayloads) {

// copy clauses array into an arraylist

this.clauses = new arraylist<>(clauses.length);

for (int i = 0; i < clauses.length; i++) {

spanquery clause = clauses[i];

if (field == null) { // check field

field = clause.getfield();

} else if (clause.getfield() != null && !clause.getfield().equals(field)) {

throw new illegalargumentexception("clauses must have same field.");

}

this.clauses.add(clause);

this.collectpayloads = collectpayloads;

this.slop = slop;

this.inorder = inorder;

}

spannearquery使用示例：

* spannearquery測試

public class spannearquerytest {

string querystringstart = "dog";

string querystringend = "quick";

spanquery querystart = new spantermquery(new term("text",querystringstart));

spanquery queryend = new spantermquery(new term("text",querystringend));

spanquery spannearquery = new spannearquery(

new spanquery[] {querystart,queryend}, 6, false, false);

topdocs results = searcher.search(spannearquery, null, 100);

示例中dog要到達quick需要經過6個跨度，需要從右至左倒序比對，是以inorder設定為false,如果設定為true會導緻查詢不出來資料。

spannotquery:使用場景是當使用spannearquery時，如果兩個term從terma到termb有多種情況，即可能出現terma或者termb在索引中重複出現，則可能有多種情況，spannotquery就是用來限制terma和termb之間不存在termc,進而排除一些情況，實作更精确的控制。預設spannotquery的構造函數是這樣的：

/** construct a spannotquery matching spans from <code>include</code> which

* have no overlap with spans from <code>exclude</code>.*/

public spannotquery(spanquery include, spanquery exclude) {

this(include, exclude, 0, 0);

顯然這裡的第一個參數include應該是spannearquery，第二個參數就是用來做排除的。

spannotquery另一個重載構造函數如下：

* have no overlap with spans from <code>exclude</code> within

* <code>dist</code> tokens of <code>include</code>. */

public spannotquery(spanquery include, spanquery exclude, int dist) {

this(include, exclude, dist, dist);

它多加了一個dist參數，官方的解釋是：construct a spannotquery matching spans from <code>include</code> which have no overlap with spans from <code>exclude</code> within <code>dist</code> tokens of <code>include</code>. 說白了就是，exclude比對到的跨度不但不能和include的跨度重疊，在include跨度前後各dist個token(詞元，不是字元)的範圍内也不能出現，這就是dist的作用。

spannotquery還有一個更複雜的構造函數重載：

* <code>pre</code> tokens before or <code>post</code> tokens of <code>include</code>. */

public spannotquery(spanquery include, spanquery exclude, int pre, int post) {

this.include = include;

this.exclude = exclude;

this.pre = (pre >=0) ? pre : 0;

this.post = (post >= 0) ? post : 0;

if (include.getfield() != null && exclude.getfield() != null && !include.getfield().equals(exclude.getfield()))

throw new illegalargumentexception("clauses must have same field.");

pre參數限制的是include跨度之前pre個token的範圍内不能出現exclude，post參數限制的是include跨度之後post個token的範圍内不能出現exclude，前面的dist其實就是pre和post取相同值的情況(傳入負數會被當作0)。這樣解釋太抽象，用示例代碼來說明吧：

import org.apache.lucene.search.spans.spannearquery;

import org.apache.lucene.search.spans.spannotquery;

* spannotquery測試

public class spannotquerytest {

doc.add(new textfield("text", "the quick brown fox quick gox jumps over the lazy dog", field.store.yes));

doc.add(new textfield("text", "the quick brown adult slave nice fox winde felt testcase gox quick jumps over the lazy dog", field.store.yes));

doc.add(new textfield("text", "the quick brown fox quick jumps over the lazy dog", field.store.yes));

string excludestring = "fox";

spanquery excludequery = new spantermquery(new term("text",excludestring));

new spanquery[] {querystart,queryend}, 12, false, false);

spannotquery spannotquery = new spannotquery(spannearquery, excludequery, 4,3);

topdocs results = searcher.search(spannotquery, null, 100);

示例代碼意思就是查詢dog和quick之間沒有fox的索引文檔，自己運作示例代碼參悟吧。

spanorquery顧名思義就是把多個spanquery用or連接起來，其實你也可以用booleanquery來代替spanorquery,但spanorquery會傳回額外的span跨度資訊，它的構造函數如下：

spanorquery(spanquery... clauses)

接收多個spanquery對象并用or連接起來，下面是spanorquery示例代碼：

import org.apache.lucene.search.spans.spanorquery;

* spanorquery測試

public class spanorquerytest {

doc.add(new textfield("text", "the quick brown adult sick slave nice fox winde felt testcase fox quick jumps over the lazy dog", field.store.yes));

string termstring = "sick";

spanquery spantermquery = new spantermquery(new term("text",termstring));

spanorquery spanorquery = new spanorquery(spannotquery,spantermquery);

topdocs results = searcher.search(spanorquery, null, 100);

spanmultitermquerywrapper:就是一個query轉換器，用于把multitermquery包裝轉換成spanquery的，具體使用示例，我貼下官方api裡提供的示例代碼吧：

wildcardquery wildcard = new wildcardquery(new term("field", "bro?n"));

spanquery spanwildcard = new spanmultitermquerywrapper<wildcardquery>(wildcard);

spanpositionrangequery:這個query用來限制比對到的跨度必須分布在(start,end)這個區間内，區間索引從零開始計算，拿示例代碼說話：

import org.apache.lucene.search.fuzzyquery;

import org.apache.lucene.search.spans.spanmultitermquerywrapper;

import org.apache.lucene.search.spans.spanpositionrangequery;

* spanpositionrangequery測試

public class spanpositionrangequerytest {

doc.add(new textfield("text", "quick brown fox", field.store.yes));

doc.add(new textfield("text", "jumps over lazy broun dog", field.store.yes));

doc.add(new textfield("text", "jumps over extremely very lazy broxn dog", field.store.yes));

fuzzyquery fq = new fuzzyquery(new term("text", "broan"));

spanquery sfq = new spanmultitermquerywrapper<fuzzyquery>(fq);

spanpositionrangequery spanpositionrangequery = new spanpositionrangequery(sfq, 3, 5);

topdocs results = searcher.search(spanpositionrangequery, null, 100);

稍微解釋下上面的代碼，首先呢，fuzzyquery fq = new fuzzyquery(new term("text", "broan"));用來查詢包含跟單詞broan相似字元的索引文檔，顯然第一個索引文檔不符合排除了一個，然後呢，我們new了一個spanquery包裝器wrapper，把fuzzyquery轉換成了spanquery,然後使用spanpositionrangequery對比對到的2種情況的落放的位置進行限制即跟broan相似的單詞必須分布在(3,5)這個區間内，顯然第3個索引文檔是分布在(3,6)這個區間内，是以第3個索引文檔被排除了，最後隻傳回第2個索引文檔。

spanpositionrangequery還有個子類spanfirstquery,其實spanfirstquery隻不過是把spanpositionrangequery構造函數裡的start參數值設定為0，僅此而已，是以不用多說，你也懂的，它的構造函數如下：

spanfirstquery(spanquery match, int end)

construct a spanfirstquery matching spans in match whose end position is less than or equal to end.

這也就是為什麼隻有一個end,沒有start,因為start預設為零，看源碼：

spanfirstquery示例我就不提供了，略過。

最後一個要說的就是fieldmaskingspanquery，它用于在多個域之間查詢，即把另一個域看作某個域，進而看起來就像在同一個域裡查詢，因為lucene預設某個條件隻能作用在單個域上，不支援跨域查詢隻能在同一個域裡查詢，是以有了fieldmaskingspanquery，下面是示例代碼：

import org.apache.lucene.search.query;

import org.apache.lucene.search.spans.fieldmaskingspanquery;

* fieldmaskingspanquery測試

public class fieldmaskingspanquerytest {

doc.add(new field("teacherid", "1", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentfirstname", "james", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentsurname", "jones", field.store.yes, field.index.not_analyzed));

//teacher2

doc.add(new field("teacherid", "2", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentsurname", "smith", field.store.yes, field.index.not_analyzed));

doc.add(new field("studentfirstname", "sally", field.store.yes, field.index.not_analyzed));

spanquery q1 = new spantermquery(new term("studentfirstname", "james"));

spanquery q2 = new spantermquery(new term("studentsurname", "jones"));

spanquery q2m = new fieldmaskingspanquery(q2, "studentfirstname");

query query = new spannearquery(new spanquery[]{q1, q2m}, -1, false);

string teacherid = document.get("teacherid");

system.out.println("teacherid:" + teacherid);

ok，spanquery就說這麼多，接下來要說的就是phrasequery。

如果你還有什麼問題請加我Ｑ-q：7-3-6-0-3-1-3-0-5，

或者加裙

一起交流學習！

轉載：http://iamyida.iteye.com/blog/2195761

Lucene5學習之SpanQuery跨度查詢

繼續閱讀

關于Gradle配置的小結

Java小案例——随機數猜測随機數猜測

nginx location中斜線的位置的重要性

27 Best Free Eclipse Plug-ins for Java Developer to be ProductiveCode Quality PluginsText Editor PluginsDependency ManagementVersion Control Integration PluginsFramework Development Continuous Integration Related PluginsOther Utility Plugins

Java String.format方法的簡單使用

neo4j之cypher使用文檔

GitHub連夜封殺！這份阿裡 10W 字内部 Java 字面試手冊到底有多強？

spark/scala關于【資源檔案】加載方法概述外部檔案加載方案測試資源檔案打包入jar包中小結

mybatis_入門程式Mybatis入門

AOP程式設計_Android優雅權限架構(1)概念基礎，2021金三銀四前言正文大綱正文

Effective Java 8:通用程式設計

OOM三種類型

工廠模式-三種類型

【遞歸】高效率求2的n次幂

win10本地scala和spark安裝安裝scala安裝spark

scala (3) Function 和 Method