天天看點

Nutch內建Solr中文分詞Schema

<?xml version="1.0" encoding="UTF-8" ?>

<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 

license agreements. See the NOTICE file distributed with this work for additional 

information regarding copyright ownership. The ASF licenses this file to 

You under the Apache License, Version 2.0 (the "License"); you may not use 

this file except in compliance with the License. You may obtain a copy of 

the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 

by applicable law or agreed to in writing, software distributed under the 

License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 

OF ANY KIND, either express or implied. See the License for the specific 

language governing permissions and limitations under the License. -->

<!-- Description: This document contains the Solr 3.1 schema definition to be
used with the Solr integration currently built into Nutch. See
https://issues.apache.org/jira/browse/NUTCH-442
https://issues.apache.org/jira/browse/NUTCH-699
https://issues.apache.org/jira/browse/NUTCH-994
https://issues.apache.org/jira/browse/NUTCH-997 and
http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?view=markup
for more info. -->

<schema name="nutch" version="1.3">

<types>
  <!-- Field type definitions referenced by the "type" attribute of each
       field declared in this schema. -->

  <!-- Untokenized string plus Trie-based numeric and date types. -->
  <fieldType name="string" class="solr.StrField"
             sortMissingLast="true" omitNorms="true" />
  <fieldType name="long" class="solr.TrieLongField"
             precisionStep="0" omitNorms="true" positionIncrementGap="0" />
  <fieldType name="float" class="solr.TrieFloatField"
             precisionStep="0" omitNorms="true" positionIncrementGap="0" />
  <fieldType name="date" class="solr.TrieDateField"
             precisionStep="0" omitNorms="true" positionIncrementGap="0" />

  <!-- Text type that declares no analyzer chain of its own; it is the type
       of the stored, non-indexed cache_content field. -->
  <fieldType name="cache_text" class="solr.TextField"
             positionIncrementGap="100">
  </fieldType>

  <!-- General text type (content, title): tokenized with the mmseg4j
       tokenizer in "complex" mode using the dictionary directory "dic",
       then lower-cased. -->
  <fieldType name="text" class="solr.TextField"
             positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory"
                 mode="complex" dicPath="dic" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
    <!-- NOTE(review): the query-time chain was missing from this copy and
         has been restored to mirror the index-time chain; confirm against
         the original schema. -->
    <analyzer type="query">
      <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory"
                 mode="complex" dicPath="dic" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
  </fieldType>

  <!-- URL type (url, host): standard tokenizer, lower-casing, then the
       word-delimiter filter generating word and number parts. -->
  <fieldType name="url" class="solr.TextField"
             positionIncrementGap="100">
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
      <filter class="solr.WordDelimiterFilterFactory"
              generateWordParts="1" generateNumberParts="1" />
    </analyzer>
  </fieldType>
</types>

<fields>
  <!-- Field declarations, grouped by the Nutch indexing plugin that
       populates them. Each "type" refers to a fieldType defined in the
       types section of this schema. -->

  <field name="id" type="string" stored="true" indexed="true" />

  <!-- core fields -->
  <field name="segment" type="string" stored="true" indexed="false" />
  <field name="digest" type="string" stored="true" indexed="false" />
  <field name="boost" type="float" stored="true" indexed="false" />

  <!-- fields for index-basic plugin -->
  <field name="host" type="url" stored="false" indexed="true" />
  <field name="site" type="string" stored="false" indexed="true" />
  <field name="url" type="url" stored="true" indexed="true"
         required="true" />
  <field name="content" type="text" stored="false" indexed="true" />
  <field name="title" type="text" stored="true" indexed="true" />
  <field name="cache" type="string" stored="true" indexed="false" />
  <field name="cache_content" type="cache_text" stored="true"
         indexed="false" />
  <field name="tstamp" type="date" stored="true" indexed="true" />

  <!-- fields for index-anchor plugin -->
  <field name="anchor" type="string" stored="true" indexed="true"
         multiValued="true" />

  <!-- fields for index-more plugin -->
  <field name="type" type="string" stored="true" indexed="true"
         multiValued="true" />
  <field name="contentLength" type="long" stored="true" indexed="false" />
  <field name="lastModified" type="date" stored="true" indexed="false" />
  <field name="date" type="date" stored="true" indexed="true" />

  <!-- fields for languageidentifier plugin -->
  <field name="lang" type="string" stored="true" indexed="true" />

  <!-- fields for subcollection plugin -->
  <field name="subcollection" type="string" stored="true" indexed="true"
         multiValued="true" />

  <!-- fields for feed plugin (tag is also used by microformats-reltag) -->
  <field name="author" type="string" stored="true" indexed="true" />
  <field name="tag" type="string" stored="true" indexed="true"
         multiValued="true" />
  <field name="feed" type="string" stored="true" indexed="true" />
  <field name="publishedDate" type="date" stored="true" indexed="true" />
  <field name="updatedDate" type="date" stored="true" indexed="true" />

  <!-- fields for creativecommons plugin -->
  <field name="cc" type="string" stored="true" indexed="true"
         multiValued="true" />
</fields>

<!-- Schema-level settings. -->

<!-- Name of the field that serves as the unique key of each document. -->
<uniqueKey>id</uniqueKey>

<!-- Field searched when a query does not name a field explicitly. -->
<defaultSearchField>content</defaultSearchField>

<!-- Query terms are combined with OR unless the query states an operator. -->
<solrQueryParser defaultOperator="OR" />

</schema>

本文轉自william_xu 51CTO部落格,原文連結:http://blog.51cto.com/williamx/773815,如需轉載請自行聯系原作者