1 | <?xml version="1.0" encoding="UTF-8" ?> |
---|
2 | <!-- |
---|
3 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
4 | contributor license agreements. See the NOTICE file distributed with |
---|
5 | this work for additional information regarding copyright ownership. |
---|
6 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
7 | (the "License"); you may not use this file except in compliance with |
---|
8 | the License. You may obtain a copy of the License at |
---|
9 | |
---|
10 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | |
---|
12 | Unless required by applicable law or agreed to in writing, software |
---|
13 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | See the License for the specific language governing permissions and |
---|
16 | limitations under the License. |
---|
17 | --> |
---|
18 | |
---|
19 | <!-- |
---|
20 | This is the Solr schema file. This file should be named "schema.xml" and |
---|
21 | should be in the conf directory under the solr home |
---|
22 | (i.e. ./solr/conf/schema.xml by default) |
---|
23 | or located where the classloader for the Solr webapp can find it. |
---|
24 | |
---|
25 | This example schema is the recommended starting point for users. |
---|
26 | It should be kept correct and concise, usable out-of-the-box. |
---|
27 | |
---|
28 | For more information on how to customize this file, please see |
---|
29 | http://wiki.apache.org/solr/SchemaXml |
---|
30 | |
---|
31 | PERFORMANCE NOTE: this schema includes many optional features and should not |
---|
32 | be used for benchmarking. To improve performance one could |
---|
33 | - set stored="false" for all fields possible (esp large fields) when you |
---|
34 | only need to search on the field but don't need to return the original |
---|
35 | value. |
---|
36 | - set indexed="false" if you don't need to search on the field, but only |
---|
37 | return the field as a result of searching on other indexed fields. |
---|
38 | - remove all unneeded copyField statements |
---|
39 | - for best index size and searching performance, set indexed="false" |
---|
40 | for all general text fields, use copyField to copy them to the |
---|
41 | catchall "text" field, and use that for searching. |
---|
42 | - For maximum indexing performance, use the StreamingUpdateSolrServer |
---|
43 | java client. |
---|
44 | - Remember to run the JVM in server mode, and use a higher logging level |
---|
45 | that avoids logging every request |
---|
46 | --> |
---|
47 | |
---|
48 | <schema name="example" version="1.5"> |
---|
49 | <!-- attribute "name" is the name of this schema and is only used for display purposes. |
---|
50 | version="x.y" is Solr's version number for the schema syntax and semantics. It should |
---|
51 | not normally be changed by applications. |
---|
52 | 1.0: multiValued attribute did not exist, all fields are multiValued by nature |
---|
53 | 1.1: multiValued attribute introduced, false by default |
---|
54 | 1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields. |
---|
55 | 1.3: removed optional field compress feature |
---|
56 | 1.4: default auto-phrase (QueryParser feature) to off |
---|
57 | 1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...) |
---|
58 | --> |
---|
59 | |
---|
60 | <fields> |
---|
61 | <!-- Valid attributes for fields: |
---|
62 | name: mandatory - the name for the field |
---|
63 | type: mandatory - the name of a field type from the |
---|
64 | <types> fieldType section |
---|
65 | indexed: true if this field should be indexed (searchable or sortable) |
---|
66 | stored: true if this field should be retrievable |
---|
67 | multiValued: true if this field may contain multiple values per document |
---|
68 | omitNorms: (expert) set to true to omit the norms associated with |
---|
69 | this field (this disables length normalization and index-time |
---|
70 | boosting for the field, and saves some memory). Only full-text |
---|
71 | fields or fields that need an index-time boost need norms. |
---|
72 | Norms are omitted for primitive (non-analyzed) types by default. |
---|
73 | termVectors: [false] set to true to store the term vector for a |
---|
74 | given field. |
---|
75 | When using MoreLikeThis, fields used for similarity should be |
---|
76 | stored for best performance. |
---|
77 | termPositions: Store position information with the term vector. |
---|
78 | This will increase storage costs. |
---|
79 | termOffsets: Store offset information with the term vector. This |
---|
80 | will increase storage costs. |
---|
81 | required: The field is required. It will throw an error if the |
---|
82 | value does not exist |
---|
83 | default: a value that should be used if no value is specified |
---|
84 | when adding a document. |
---|
85 | --> |
---|
86 | |
---|
87 | <!-- field names should consist of alphanumeric or underscore characters only and |
---|
88 | not start with a digit. This is not currently strictly enforced, |
---|
89 | but other field names will not have first class support from all components |
---|
90 | and back compatibility is not guaranteed. Names with both leading and |
---|
91 | trailing underscores (e.g. _version_) are reserved. |
---|
92 | --> |
---|
93 | |
---|
94 | <field name="id" type="string" indexed="true" stored="true" required="true" /> |
---|
95 | <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/> |
---|
96 | <field name="name" type="text_general" indexed="true" stored="true"/> |
---|
97 | <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/> |
---|
98 | <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/> |
---|
99 | <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/> |
---|
100 | <field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> |
---|
101 | |
---|
102 | <field name="weight" type="float" indexed="true" stored="true"/> |
---|
103 | <field name="price" type="float" indexed="true" stored="true"/> |
---|
104 | <field name="popularity" type="int" indexed="true" stored="true" /> |
---|
105 | <field name="inStock" type="boolean" indexed="true" stored="true" /> |
---|
106 | |
---|
107 | <field name="store" type="location" indexed="true" stored="true"/> |
---|
108 | |
---|
109 | <!-- Common metadata fields, named specifically to match up with |
---|
110 | SolrCell metadata when parsing rich documents such as Word, PDF. |
---|
111 | Some fields are multiValued only because Tika currently may return |
---|
112 | multiple values for them. |
---|
113 | --> |
---|
114 | <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/> |
---|
115 | <field name="subject" type="text_general" indexed="true" stored="true"/> |
---|
116 | <field name="description" type="text_general" indexed="true" stored="true"/> |
---|
117 | <field name="comments" type="text_general" indexed="true" stored="true"/> |
---|
118 | <field name="author" type="text_general" indexed="true" stored="true"/> |
---|
119 | <field name="keywords" type="text_general" indexed="true" stored="true"/> |
---|
120 | <field name="category" type="text_general" indexed="true" stored="true"/> |
---|
121 | <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> |
---|
122 | <field name="last_modified" type="date" indexed="true" stored="true"/> |
---|
123 | <field name="links" type="string" indexed="true" stored="true" multiValued="true"/> |
---|
124 | |
---|
125 | |
---|
126 | <!-- catchall field, containing all other searchable text fields (implemented |
---|
127 | via copyField further on in this schema) --> |
---|
128 | <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/> |
---|
129 | |
---|
130 | <!-- catchall text field that indexes tokens both normally and in reverse for efficient |
---|
131 | leading wildcard queries. --> |
---|
132 | <field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/> |
---|
133 | |
---|
134 | <!-- non-tokenized version of manufacturer to make it easier to sort or group |
---|
135 | results by manufacturer. copied from "manu" via copyField --> |
---|
136 | <field name="manu_exact" type="string" indexed="true" stored="false"/> |
---|
137 | |
---|
138 | <field name="payloads" type="payloads" indexed="true" stored="true"/> |
---|
139 | |
---|
140 | |
---|
141 | <field name="_version_" type="long" indexed="true" stored="true"/> |
---|
142 | |
---|
143 | <!-- Uncommenting the following will create a "timestamp" field using |
---|
144 | a default value of "NOW" to indicate when each document was indexed. |
---|
145 | --> |
---|
146 | <!-- |
---|
147 | <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> |
---|
148 | --> |
---|
149 | |
---|
150 | <!-- Dynamic field definitions allow using convention over configuration |
---|
151 | for fields via the specification of patterns to match field names. |
---|
152 | EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i) |
---|
153 | RESTRICTION: the glob-like pattern in the name attribute must have |
---|
154 | a "*" only at the start or the end. --> |
---|
155 | |
---|
156 | <dynamicField name="*_i" type="int" indexed="true" stored="true"/> |
---|
157 | <dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true"/> |
---|
158 | <dynamicField name="*_s" type="string" indexed="true" stored="true" /> |
---|
159 | <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/> |
---|
160 | <dynamicField name="*_l" type="long" indexed="true" stored="true"/> |
---|
161 | <dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/> |
---|
162 | <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/> |
---|
163 | <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/> |
---|
164 | <dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true"/> |
---|
165 | <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/> |
---|
166 | <dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true"/> |
---|
167 | <dynamicField name="*_f" type="float" indexed="true" stored="true"/> |
---|
168 | <dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true"/> |
---|
169 | <dynamicField name="*_d" type="double" indexed="true" stored="true"/> |
---|
170 | <dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true"/> |
---|
171 | |
---|
172 | <!-- Type used to index the lat and lon components for the "location" FieldType --> |
---|
173 | <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" /> |
---|
174 | |
---|
175 | <dynamicField name="*_dt" type="date" indexed="true" stored="true"/> |
---|
176 | <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/> |
---|
177 | <dynamicField name="*_p" type="location" indexed="true" stored="true"/> |
---|
178 | |
---|
179 | <!-- some trie-coded dynamic fields for faster range queries --> |
---|
180 | <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/> |
---|
181 | <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/> |
---|
182 | <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/> |
---|
183 | <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/> |
---|
184 | <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/> |
---|
185 | |
---|
186 | <dynamicField name="*_pi" type="pint" indexed="true" stored="true"/> |
---|
187 | <dynamicField name="*_c" type="currency" indexed="true" stored="true"/> |
---|
188 | |
---|
189 | <dynamicField name="ignored_*" type="ignored" multiValued="true"/> |
---|
190 | <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/> |
---|
191 | |
---|
192 | <dynamicField name="random_*" type="random" /> |
---|
193 | |
---|
194 | <!-- uncomment the following to ignore any fields that don't already match an existing |
---|
195 | field name or dynamic field, rather than reporting them as an error. |
---|
196 | alternately, change the type="ignored" to some other type e.g. "text" if you want |
---|
197 | unknown fields indexed and/or stored by default --> |
---|
198 | <!--dynamicField name="*" type="ignored" multiValued="true" /--> |
---|
199 | |
---|
200 | </fields> |
---|
201 | |
---|
202 | |
---|
203 | <!-- Field to use to determine and enforce document uniqueness. |
---|
204 | Unless this field is marked with required="false", it will be a required field |
---|
205 | --> |
---|
206 | <uniqueKey>id</uniqueKey> |
---|
207 | |
---|
208 | <!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when |
---|
209 | parsing a query string that isn't explicit about the field. Machine (non-user) |
---|
210 | generated queries are best made explicit, or they can use the "df" request parameter |
---|
211 | which takes precedence over this. |
---|
212 | Note: Un-commenting defaultSearchField will be insufficient if your request handler |
---|
213 | in solrconfig.xml defines "df", which takes precedence. That would need to be removed. |
---|
214 | <defaultSearchField>text</defaultSearchField> --> |
---|
215 | |
---|
216 | <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers |
---|
217 | when parsing a query string to determine if a clause of the query should be marked as |
---|
218 | required or optional, assuming the clause isn't already marked by some operator. |
---|
219 | The default is OR, which is generally assumed so it is not a good idea to change it |
---|
220 | globally here. The "q.op" request parameter takes precedence over this. |
---|
221 | <solrQueryParser defaultOperator="OR"/> --> |
---|
222 | |
---|
223 | <!-- copyField commands copy one field to another at the time a document |
---|
224 | is added to the index. It's used either to index the same field differently, |
---|
225 | or to add multiple fields to the same field for easier/faster searching. --> |
---|
226 | |
---|
227 | <copyField source="cat" dest="text"/> |
---|
228 | <copyField source="name" dest="text"/> |
---|
229 | <copyField source="manu" dest="text"/> |
---|
230 | <copyField source="features" dest="text"/> |
---|
231 | <copyField source="includes" dest="text"/> |
---|
232 | <copyField source="manu" dest="manu_exact"/> |
---|
233 | |
---|
234 | <!-- Copy the price into a currency enabled field (default USD) --> |
---|
235 | <copyField source="price" dest="price_c"/> |
---|
236 | |
---|
237 | <!-- Above, multiple source fields are copied to the [text] field. |
---|
238 | Another way to map multiple source fields to the same |
---|
239 | destination field is to use the dynamic field syntax. |
---|
240 | copyField also supports a maxChars to copy setting. --> |
---|
241 | |
---|
242 | <!-- <copyField source="*_t" dest="text" maxChars="3000"/> --> |
---|
243 | |
---|
244 | <!-- copy name to alphaNameSort, a field designed for sorting by name --> |
---|
245 | <!-- <copyField source="name" dest="alphaNameSort"/> --> |
---|
246 | |
---|
247 | |
---|
248 | <!-- Similarity is the scoring routine for each document vs. a query. |
---|
249 | A custom similarity may be specified here, but the default is fine |
---|
250 | for most applications. --> |
---|
251 | <!-- <similarity class="org.apache.lucene.search.similarities.DefaultSimilarity"/> --> |
---|
252 | <!-- ... OR ... |
---|
253 | Specify a SimilarityFactory class name implementation |
---|
254 | allowing parameters to be used. |
---|
255 | --> |
---|
256 | <!-- |
---|
257 | <similarity class="com.example.solr.CustomSimilarityFactory"> |
---|
258 | <str name="paramkey">param value</str> |
---|
259 | </similarity> |
---|
260 | --> |
---|
261 | |
---|
262 | |
---|
263 | |
---|
264 | <types> |
---|
265 | <!-- field type definitions. The "name" attribute is |
---|
266 | just a label to be used by field definitions. The "class" |
---|
267 | attribute and any other attributes determine the real |
---|
268 | behavior of the fieldType. |
---|
269 | Class names starting with "solr" refer to java classes in a |
---|
270 | standard package such as org.apache.solr.analysis |
---|
271 | --> |
---|
272 | |
---|
273 | <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> |
---|
274 | <fieldType name="string" class="solr.StrField" sortMissingLast="true" /> |
---|
275 | |
---|
276 | <!-- boolean type: "true" or "false" --> |
---|
277 | <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> |
---|
278 | |
---|
279 | <!-- The optional sortMissingLast and sortMissingFirst attributes are |
---|
280 | currently supported on types that are sorted internally as strings |
---|
281 | and on numeric types. |
---|
282 | This includes "string","boolean", and, as of 3.5 (and 4.x), |
---|
283 | int, float, long, date, double, including the "Trie" variants. |
---|
284 | - If sortMissingLast="true", then a sort on this field will cause documents |
---|
285 | without the field to come after documents with the field, |
---|
286 | regardless of the requested sort order (asc or desc). |
---|
287 | - If sortMissingFirst="true", then a sort on this field will cause documents |
---|
288 | without the field to come before documents with the field, |
---|
289 | regardless of the requested sort order. |
---|
290 | - If sortMissingLast="false" and sortMissingFirst="false" (the default), |
---|
291 | then default lucene sorting will be used which places docs without the |
---|
292 | field first in an ascending sort and last in a descending sort. |
---|
293 | --> |
---|
294 | |
---|
295 | <!-- |
---|
296 | Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. |
---|
297 | --> |
---|
298 | <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/> |
---|
299 | <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/> |
---|
300 | <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> |
---|
301 | <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/> |
---|
302 | |
---|
303 | <!-- |
---|
304 | Numeric field types that index each value at various levels of precision |
---|
305 | to accelerate range queries when the number of values between the range |
---|
306 | endpoints is large. See the javadoc for NumericRangeQuery for internal |
---|
307 | implementation details. |
---|
308 | |
---|
309 | Smaller precisionStep values (specified in bits) will lead to more tokens |
---|
310 | indexed per value, slightly larger index size, and faster range queries. |
---|
311 | A precisionStep of 0 disables indexing at different precision levels. |
---|
312 | --> |
---|
313 | <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/> |
---|
314 | <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/> |
---|
315 | <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/> |
---|
316 | <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/> |
---|
317 | |
---|
318 | <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and |
---|
319 | is a more restricted form of the canonical representation of dateTime |
---|
320 | http://www.w3.org/TR/xmlschema-2/#dateTime |
---|
321 | The trailing "Z" designates UTC time and is mandatory. |
---|
322 | Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z |
---|
323 | All other components are mandatory. |
---|
324 | |
---|
325 | Expressions can also be used to denote calculations that should be |
---|
326 | performed relative to "NOW" to determine the value, e.g.: |
---|
327 | |
---|
328 | NOW/HOUR |
---|
329 | ... Round to the start of the current hour |
---|
330 | NOW-1DAY |
---|
331 | ... Exactly 1 day prior to now |
---|
332 | NOW/DAY+6MONTHS+3DAYS |
---|
333 | ... 6 months and 3 days in the future from the start of |
---|
334 | the current day |
---|
335 | |
---|
336 | Consult the DateField javadocs for more information. |
---|
337 | |
---|
338 | Note: For faster range queries, consider the tdate type |
---|
339 | --> |
---|
340 | <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/> |
---|
341 | |
---|
342 | <!-- A Trie based date field for faster date range queries and date faceting. --> |
---|
343 | <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/> |
---|
344 | |
---|
345 | |
---|
346 | <!-- Binary data type. The data should be sent/retrieved as Base64 encoded Strings --> |
---|
347 | <fieldType name="binary" class="solr.BinaryField"/> |
---|
348 | |
---|
349 | <!-- |
---|
350 | Note: |
---|
351 | These should only be used for compatibility with existing indexes (created with lucene or older Solr versions). |
---|
352 | Use Trie based fields instead. As of Solr 3.5 and 4.x, Trie based fields support sortMissingFirst/Last |
---|
353 | |
---|
354 | Plain numeric field types that store and index the text |
---|
355 | value verbatim (and hence don't correctly support range queries, since the |
---|
356 | lexicographic ordering isn't equal to the numeric ordering) |
---|
357 | --> |
---|
358 | <fieldType name="pint" class="solr.IntField"/> |
---|
359 | <fieldType name="plong" class="solr.LongField"/> |
---|
360 | <fieldType name="pfloat" class="solr.FloatField"/> |
---|
361 | <fieldType name="pdouble" class="solr.DoubleField"/> |
---|
362 | <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/> |
---|
363 | |
---|
364 | <!-- The "RandomSortField" is not used to store or search any |
---|
365 | data. You can declare fields of this type in your schema |
---|
366 | to generate pseudo-random orderings of your docs for sorting |
---|
367 | or function purposes. The ordering is generated based on the field |
---|
368 | name and the version of the index. As long as the index version |
---|
369 | remains unchanged, and the same field name is reused, |
---|
370 | the ordering of the docs will be consistent. |
---|
371 | If you want different pseudo-random orderings of documents, |
---|
372 | for the same version of the index, use a dynamicField and |
---|
373 | change the field name in the request. |
---|
374 | --> |
---|
375 | <fieldType name="random" class="solr.RandomSortField" indexed="true" /> |
---|
376 | |
---|
377 | <!-- solr.TextField allows the specification of custom text analyzers |
---|
378 | specified as a tokenizer and a list of token filters. Different |
---|
379 | analyzers may be specified for indexing and querying. |
---|
380 | |
---|
381 | The optional positionIncrementGap puts space between multiple fields of |
---|
382 | this type on the same document, with the purpose of preventing false phrase |
---|
383 | matching across fields. |
---|
384 | |
---|
385 | For more info on customizing your analyzer chain, please see |
---|
386 | http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters |
---|
387 | --> |
---|
388 | |
---|
389 | <!-- One can also specify an existing Analyzer class that has a |
---|
390 | default constructor via the class attribute on the analyzer element. |
---|
391 | Example: |
---|
392 | <fieldType name="text_greek" class="solr.TextField"> |
---|
393 | <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> |
---|
394 | </fieldType> |
---|
395 | --> |
---|
396 | |
---|
397 | <!-- A text field that only splits on whitespace for exact matching of words --> |
---|
398 | <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> |
---|
399 | <analyzer> |
---|
400 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
401 | </analyzer> |
---|
402 | </fieldType> |
---|
403 | |
---|
404 | <!-- A general text field that has reasonable, generic |
---|
405 | cross-language defaults: it tokenizes with StandardTokenizer, |
---|
406 | removes stop words from case-insensitive "stopwords.txt" |
---|
407 | (empty by default), and down cases. At query time only, it |
---|
408 | also applies synonyms. --> |
---|
409 | <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> |
---|
410 | <analyzer type="index"> |
---|
411 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
412 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
413 | <!-- in this example, we will only use synonyms at query time |
---|
414 | <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
---|
415 | --> |
---|
416 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
417 | </analyzer> |
---|
418 | <analyzer type="query"> |
---|
419 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
420 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
421 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
422 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
423 | </analyzer> |
---|
424 | </fieldType> |
---|
425 | |
---|
426 | <!-- A text field with defaults appropriate for English: it |
---|
427 | tokenizes with StandardTokenizer, removes English stop words |
---|
428 | (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and |
---|
429 | finally applies Porter's stemming. The query time analyzer |
---|
430 | also applies synonyms from synonyms.txt. --> |
---|
431 | <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> |
---|
432 | <analyzer type="index"> |
---|
433 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
434 | <!-- in this example, we will only use synonyms at query time |
---|
435 | <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
---|
436 | --> |
---|
437 | <!-- Case insensitive stop word removal. |
---|
438 | add enablePositionIncrements=true in both the index and query |
---|
439 | analyzers to leave a 'gap' for more accurate phrase queries. |
---|
440 | --> |
---|
441 | <filter class="solr.StopFilterFactory" |
---|
442 | ignoreCase="true" |
---|
443 | words="lang/stopwords_en.txt" |
---|
444 | enablePositionIncrements="true" |
---|
445 | /> |
---|
446 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
447 | <filter class="solr.EnglishPossessiveFilterFactory"/> |
---|
448 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
449 | <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
---|
450 | <filter class="solr.EnglishMinimalStemFilterFactory"/> |
---|
451 | --> |
---|
452 | <filter class="solr.PorterStemFilterFactory"/> |
---|
453 | </analyzer> |
---|
454 | <analyzer type="query"> |
---|
455 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
456 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
457 | <filter class="solr.StopFilterFactory" |
---|
458 | ignoreCase="true" |
---|
459 | words="lang/stopwords_en.txt" |
---|
460 | enablePositionIncrements="true" |
---|
461 | /> |
---|
462 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
463 | <filter class="solr.EnglishPossessiveFilterFactory"/> |
---|
464 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
465 | <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
---|
466 | <filter class="solr.EnglishMinimalStemFilterFactory"/> |
---|
467 | --> |
---|
468 | <filter class="solr.PorterStemFilterFactory"/> |
---|
469 | </analyzer> |
---|
470 | </fieldType> |
---|
471 | |
---|
472 | <!-- A text field with defaults appropriate for English, plus |
---|
473 | aggressive word-splitting and autophrase features enabled. |
---|
474 | This field is just like text_en, except it adds |
---|
475 | WordDelimiterFilter to enable splitting and matching of |
---|
476 | words on case-change, alpha numeric boundaries, and |
---|
477 | non-alphanumeric chars. This means certain compound word |
---|
478 | cases will work, for example query "wi fi" will match |
---|
479 | document "WiFi" or "wi-fi". |
---|
480 | --> |
---|
481 | <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> |
---|
482 | <analyzer type="index"> |
---|
483 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
484 | <!-- in this example, we will only use synonyms at query time |
---|
485 | <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
---|
486 | --> |
---|
487 | <!-- Case insensitive stop word removal. |
---|
488 | add enablePositionIncrements=true in both the index and query |
---|
489 | analyzers to leave a 'gap' for more accurate phrase queries. |
---|
490 | --> |
---|
491 | <filter class="solr.StopFilterFactory" |
---|
492 | ignoreCase="true" |
---|
493 | words="lang/stopwords_en.txt" |
---|
494 | enablePositionIncrements="true" |
---|
495 | /> |
---|
496 | <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> |
---|
497 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
498 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
499 | <filter class="solr.PorterStemFilterFactory"/> |
---|
500 | </analyzer> |
---|
501 | <analyzer type="query"> |
---|
502 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
503 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
504 | <filter class="solr.StopFilterFactory" |
---|
505 | ignoreCase="true" |
---|
506 | words="lang/stopwords_en.txt" |
---|
507 | enablePositionIncrements="true" |
---|
508 | /> |
---|
509 | <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
---|
510 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
511 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
512 | <filter class="solr.PorterStemFilterFactory"/> |
---|
513 | </analyzer> |
---|
514 | </fieldType> |
---|
515 | |
---|
516 | <!-- Less flexible matching, but less false matches. Probably not ideal for product names, |
---|
517 | but may be good for SKUs. Can insert dashes in the wrong place and still match. --> |
---|
518 | <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> |
---|
519 | <analyzer> |
---|
520 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
521 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> |
---|
522 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/> |
---|
523 | <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> |
---|
524 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
525 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
526 | <filter class="solr.EnglishMinimalStemFilterFactory"/> |
---|
527 | <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes |
---|
528 | possible with WordDelimiterFilter in conjunction with stemming. --> |
---|
529 | <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
---|
530 | </analyzer> |
---|
531 | </fieldType> |
---|
532 | |
---|
533 | <!-- Just like text_general except it reverses the characters of |
---|
534 | each token, to enable more efficient leading wildcard queries. --> |
---|
535 | <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> |
---|
536 | <analyzer type="index"> |
---|
537 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
538 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
539 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
540 | <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" |
---|
541 | maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> |
---|
542 | </analyzer> |
---|
543 | <analyzer type="query"> |
---|
544 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
545 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
546 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
547 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
548 | </analyzer> |
---|
549 | </fieldType> |
---|
550 | |
---|
551 | <!-- charFilter + WhitespaceTokenizer --> |
---|
552 | <!-- |
---|
553 | <fieldType name="text_char_norm" class="solr.TextField" positionIncrementGap="100" > |
---|
554 | <analyzer> |
---|
555 | <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/> |
---|
556 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
557 | </analyzer> |
---|
558 | </fieldType> |
---|
559 | --> |
---|
560 | |
---|
561 | <!-- This is an example of using the KeywordTokenizer along |
---|
562 | with various TokenFilterFactories to produce a sortable field |
---|
563 | that does not include some properties of the source text |
---|
564 | --> |
---|
565 | <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true"> |
---|
566 | <analyzer> |
---|
567 | <!-- KeywordTokenizer does no actual tokenizing, so the entire |
---|
568 | input string is preserved as a single token |
---|
569 | --> |
---|
570 | <tokenizer class="solr.KeywordTokenizerFactory"/> |
---|
571 | <!-- The LowerCase TokenFilter does what you expect, which can be |
---|
572 | useful when you want your sorting to be case insensitive |
---|
573 | --> |
---|
574 | <filter class="solr.LowerCaseFilterFactory" /> |
---|
575 | <!-- The TrimFilter removes any leading or trailing whitespace --> |
---|
576 | <filter class="solr.TrimFilterFactory" /> |
---|
577 | <!-- The PatternReplaceFilter gives you the flexibility to use |
---|
578 | Java Regular expression to replace any sequence of characters |
---|
579 | matching a pattern with an arbitrary replacement string, |
---|
580 | which may include back references to portions of the original |
---|
581 | string matched by the pattern. |
---|
582 | |
---|
583 | See the Java Regular Expression documentation for more |
---|
584 | information on pattern and replacement string syntax. |
---|
585 | |
---|
586 | http://java.sun.com/j2se/1.6.0/docs/api/java/util/regex/package-summary.html |
---|
587 | --> |
---|
588 | <filter class="solr.PatternReplaceFilterFactory" |
---|
589 | pattern="([^a-z])" replacement="" replace="all" |
---|
590 | /> |
---|
591 | </analyzer> |
---|
592 | </fieldType> |
---|
593 | <!-- Phonetic matching example: StandardTokenizer output is passed through the Double Metaphone filter (inject="false"). --> |
---|
594 | <fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField" > |
---|
595 | <analyzer> |
---|
596 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
597 | <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> |
---|
598 | </analyzer> |
---|
599 | </fieldType> |
---|
600 | <!-- Example type that attaches a float payload to each whitespace-delimited token (see the filter notes below). --> |
---|
601 | <fieldType name="payloads" stored="false" indexed="true" class="solr.TextField" > |
---|
602 | <analyzer> |
---|
603 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
604 | <!-- |
---|
605 | The DelimitedPayloadTokenFilter can put payloads on tokens... for example, |
---|
606 | a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f |
---|
607 | Attributes of the DelimitedPayloadTokenFilterFactory : |
---|
608 | "delimiter" - a one character delimiter. Default is | (pipe) |
---|
609 | "encoder" - how to encode the following value into a payload |
---|
610 | float -> org.apache.lucene.analysis.payloads.FloatEncoder, |
---|
611 | integer -> o.a.l.a.p.IntegerEncoder |
---|
612 | identity -> o.a.l.a.p.IdentityEncoder |
---|
613 | Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. |
---|
614 | --> |
---|
615 | <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> |
---|
616 | </analyzer> |
---|
617 | </fieldType> |
---|
618 | |
---|
619 | <!-- lowercases the entire field value, keeping it as a single token. --> |
---|
620 | <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> |
---|
621 | <analyzer> |
---|
622 | <tokenizer class="solr.KeywordTokenizerFactory"/> |
---|
623 | <filter class="solr.LowerCaseFilterFactory" /> |
---|
624 | </analyzer> |
---|
625 | </fieldType> |
---|
626 | |
---|
627 | <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100"> |
---|
628 | <analyzer> |
---|
629 | <tokenizer class="solr.PathHierarchyTokenizerFactory"/> |
---|
630 | </analyzer> |
---|
631 | </fieldType> |
---|
632 | |
---|
633 | |
---|
634 | <!-- Since fields of this type are by default not stored or indexed, |
---|
635 | any data added to them will be ignored outright. --> |
---|
636 | <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> |
---|
637 | |
---|
638 | <!-- This point type indexes the coordinates as separate fields (subFields) |
---|
639 | If subFieldType is defined, it references a type, and a dynamic field |
---|
640 | definition is created matching *___<typename>. Alternately, if |
---|
641 | subFieldSuffix is defined, that is used to create the subFields. |
---|
642 | Example: if subFieldType="double", then the coordinates would be |
---|
643 | indexed in fields myloc_0___double,myloc_1___double. |
---|
644 | Example: if subFieldSuffix="_d" then the coordinates would be indexed |
---|
645 | in fields myloc_0_d,myloc_1_d |
---|
646 | The subFields are an implementation detail of the fieldType, and end |
---|
647 | users normally should not need to know about them. |
---|
648 | --> |
---|
649 | <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> |
---|
650 | |
---|
651 | <!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. --> |
---|
652 | <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> |
---|
653 | |
---|
654 | <!-- |
---|
655 | A Geohash is a compact representation of a latitude longitude pair in a single field. |
---|
656 | See http://wiki.apache.org/solr/SpatialSearch |
---|
657 | --> |
---|
658 | <fieldType name="geohash" class="solr.GeoHashField"/> |
---|
659 | |
---|
660 | <!-- Money/currency field type. See http://wiki.apache.org/solr/MoneyFieldType |
---|
661 | Parameters: |
---|
662 | defaultCurrency: Specifies the default currency if none specified. Defaults to "USD" |
---|
663 | precisionStep: Specifies the precisionStep for the TrieLong field used for the amount |
---|
664 | providerClass: Lets you plug in other exchange provider backend: |
---|
665 | solr.FileExchangeRateProvider is the default and takes one parameter: |
---|
666 | currencyConfig: name of an xml file holding exchange rates |
---|
667 | solr.OpenExchangeRatesOrgProvider uses rates from openexchangerates.org: |
---|
668 | ratesFileLocation: URL or path to rates JSON file (default latest.json on the web) |
---|
669 | refreshInterval: Number of minutes between each rates fetch (default: 1440, min: 60) |
---|
670 | --> |
---|
671 | <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" /> |
---|
672 | |
---|
673 | |
---|
674 | |
---|
675 | <!-- some examples for different languages (generally ordered by ISO code) --> |
---|
676 | |
---|
677 | <!-- Arabic --> |
---|
678 | <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100"> |
---|
679 | <analyzer> |
---|
680 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
681 | <!-- for any non-arabic --> |
---|
682 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
683 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/> |
---|
684 | <!-- normalizes ﻯ to ﻱ, etc --> |
---|
685 | <filter class="solr.ArabicNormalizationFilterFactory"/> |
---|
686 | <filter class="solr.ArabicStemFilterFactory"/> |
---|
687 | </analyzer> |
---|
688 | </fieldType> |
---|
689 | |
---|
690 | <!-- Bulgarian --> |
---|
691 | <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100"> |
---|
692 | <analyzer> |
---|
693 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
694 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
695 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/> |
---|
696 | <filter class="solr.BulgarianStemFilterFactory"/> |
---|
697 | </analyzer> |
---|
698 | </fieldType> |
---|
699 | |
---|
700 | <!-- Catalan --> |
---|
701 | <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100"> |
---|
702 | <analyzer> |
---|
703 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
704 | <!-- removes l', etc --> |
---|
705 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/> |
---|
706 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
707 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/> |
---|
708 | <filter class="solr.SnowballPorterFilterFactory" language="Catalan"/> |
---|
709 | </analyzer> |
---|
710 | </fieldType> |
---|
711 | |
---|
712 | <!-- CJK bigram (see text_ja for a Japanese configuration using morphological analysis) --> |
---|
713 | <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> |
---|
714 | <analyzer> |
---|
715 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
716 | <!-- normalize width before bigram, as e.g. half-width dakuten combine --> |
---|
717 | <filter class="solr.CJKWidthFilterFactory"/> |
---|
718 | <!-- for any non-CJK --> |
---|
719 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
720 | <filter class="solr.CJKBigramFilterFactory"/> |
---|
721 | </analyzer> |
---|
722 | </fieldType> |
---|
723 | |
---|
724 | <!-- Czech --> |
---|
725 | <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100"> |
---|
726 | <analyzer> |
---|
727 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
728 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
729 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/> |
---|
730 | <filter class="solr.CzechStemFilterFactory"/> |
---|
731 | </analyzer> |
---|
732 | </fieldType> |
---|
733 | |
---|
734 | <!-- Danish --> |
---|
735 | <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100"> |
---|
736 | <analyzer> |
---|
737 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
738 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
739 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/> |
---|
740 | <filter class="solr.SnowballPorterFilterFactory" language="Danish"/> |
---|
741 | </analyzer> |
---|
742 | </fieldType> |
---|
743 | |
---|
744 | <!-- German --> |
---|
745 | <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100"> |
---|
746 | <analyzer> |
---|
747 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
748 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
749 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/> |
---|
750 | <filter class="solr.GermanNormalizationFilterFactory"/> |
---|
751 | <filter class="solr.GermanLightStemFilterFactory"/> |
---|
752 | <!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> --> |
---|
753 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="German2"/> --> |
---|
754 | </analyzer> |
---|
755 | </fieldType> |
---|
756 | |
---|
757 | <!-- Greek --> |
---|
758 | <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100"> |
---|
759 | <analyzer> |
---|
760 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
761 | <!-- greek specific lowercase for sigma --> |
---|
762 | <filter class="solr.GreekLowerCaseFilterFactory"/> |
---|
763 | <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/> |
---|
764 | <filter class="solr.GreekStemFilterFactory"/> |
---|
765 | </analyzer> |
---|
766 | </fieldType> |
---|
767 | |
---|
768 | <!-- Spanish --> |
---|
769 | <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100"> |
---|
770 | <analyzer> |
---|
771 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
772 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
773 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/> |
---|
774 | <filter class="solr.SpanishLightStemFilterFactory"/> |
---|
775 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> --> |
---|
776 | </analyzer> |
---|
777 | </fieldType> |
---|
778 | |
---|
779 | <!-- Basque --> |
---|
780 | <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100"> |
---|
781 | <analyzer> |
---|
782 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
783 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
784 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/> |
---|
785 | <filter class="solr.SnowballPorterFilterFactory" language="Basque"/> |
---|
786 | </analyzer> |
---|
787 | </fieldType> |
---|
788 | |
---|
789 | <!-- Persian --> |
---|
790 | <fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100"> |
---|
791 | <analyzer> |
---|
792 | <!-- for ZWNJ --> |
---|
793 | <charFilter class="solr.PersianCharFilterFactory"/> |
---|
794 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
795 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
796 | <filter class="solr.ArabicNormalizationFilterFactory"/> |
---|
797 | <filter class="solr.PersianNormalizationFilterFactory"/> |
---|
798 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/> |
---|
799 | </analyzer> |
---|
800 | </fieldType> |
---|
801 | |
---|
802 | <!-- Finnish --> |
---|
803 | <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100"> |
---|
804 | <analyzer> |
---|
805 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
806 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
807 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/> |
---|
808 | <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/> |
---|
809 | <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> --> |
---|
810 | </analyzer> |
---|
811 | </fieldType> |
---|
812 | |
---|
813 | <!-- French --> |
---|
814 | <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100"> |
---|
815 | <analyzer> |
---|
816 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
817 | <!-- removes l', etc --> |
---|
818 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/> |
---|
819 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
820 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/> |
---|
821 | <filter class="solr.FrenchLightStemFilterFactory"/> |
---|
822 | <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> --> |
---|
823 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> --> |
---|
824 | </analyzer> |
---|
825 | </fieldType> |
---|
826 | |
---|
827 | <!-- Irish --> |
---|
828 | <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"> |
---|
829 | <analyzer> |
---|
830 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
831 | <!-- removes d', etc --> |
---|
832 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/> |
---|
833 | <!-- removes n-, etc. position increments is intentionally false! --> |
---|
834 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/> |
---|
835 | <filter class="solr.IrishLowerCaseFilterFactory"/> |
---|
836 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/> |
---|
837 | <filter class="solr.SnowballPorterFilterFactory" language="Irish"/> |
---|
838 | </analyzer> |
---|
839 | </fieldType> |
---|
840 | |
---|
841 | <!-- Galician --> |
---|
842 | <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100"> |
---|
843 | <analyzer> |
---|
844 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
845 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
846 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/> |
---|
847 | <filter class="solr.GalicianStemFilterFactory"/> |
---|
848 | <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> --> |
---|
849 | </analyzer> |
---|
850 | </fieldType> |
---|
851 | |
---|
852 | <!-- Hindi --> |
---|
853 | <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100"> |
---|
854 | <analyzer> |
---|
855 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
856 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
857 | <!-- normalizes unicode representation --> |
---|
858 | <filter class="solr.IndicNormalizationFilterFactory"/> |
---|
859 | <!-- normalizes variation in spelling --> |
---|
860 | <filter class="solr.HindiNormalizationFilterFactory"/> |
---|
861 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/> |
---|
862 | <filter class="solr.HindiStemFilterFactory"/> |
---|
863 | </analyzer> |
---|
864 | </fieldType> |
---|
865 | |
---|
866 | <!-- Hungarian --> |
---|
867 | <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100"> |
---|
868 | <analyzer> |
---|
869 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
870 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
871 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/> |
---|
872 | <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/> |
---|
873 | <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> --> |
---|
874 | </analyzer> |
---|
875 | </fieldType> |
---|
876 | |
---|
877 | <!-- Armenian --> |
---|
878 | <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100"> |
---|
879 | <analyzer> |
---|
880 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
881 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
882 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/> |
---|
883 | <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/> |
---|
884 | </analyzer> |
---|
885 | </fieldType> |
---|
886 | |
---|
887 | <!-- Indonesian --> |
---|
888 | <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100"> |
---|
889 | <analyzer> |
---|
890 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
891 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
892 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/> |
---|
893 | <!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false --> |
---|
894 | <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/> |
---|
895 | </analyzer> |
---|
896 | </fieldType> |
---|
897 | |
---|
898 | <!-- Italian --> |
---|
899 | <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100"> |
---|
900 | <analyzer> |
---|
901 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
902 | <!-- removes l', etc --> |
---|
903 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/> |
---|
904 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
905 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/> |
---|
906 | <filter class="solr.ItalianLightStemFilterFactory"/> |
---|
907 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> --> |
---|
908 | </analyzer> |
---|
909 | </fieldType> |
---|
910 | |
---|
911 | <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming) |
---|
912 | |
---|
913 | NOTE: If you want to optimize search for precision, use default operator AND in your query |
---|
914 | parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use |
---|
915 | OR if you would like to optimize for recall (default). |
---|
916 | --> |
---|
917 | <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> |
---|
918 | <analyzer> |
---|
919 | <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer) |
---|
920 | |
---|
921 | Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic |
---|
922 | is used to segment compounds into their parts and the compound itself is kept as a synonym. |
---|
923 | |
---|
924 | Valid values for attribute mode are: |
---|
925 | normal: regular segmentation |
---|
926 | search: segmentation useful for search with synonyms compounds (default) |
---|
927 | extended: same as search mode, but unigrams unknown words (experimental) |
---|
928 | |
---|
929 | For some applications it might be good to use search mode for indexing and normal mode for |
---|
930 | queries to reduce recall and prevent parts of compounds from being matched and highlighted. |
---|
931 | Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query. |
---|
932 | |
---|
933 | Kuromoji also has a convenient user dictionary feature that allows overriding the statistical |
---|
934 | model with your own entries for segmentation, part-of-speech tags and readings without a need |
---|
935 | to specify weights. Notice that user dictionaries have not been subject to extensive testing. |
---|
936 | |
---|
937 | User dictionary attributes are: |
---|
938 | userDictionary: user dictionary filename |
---|
939 | userDictionaryEncoding: user dictionary encoding (default is UTF-8) |
---|
940 | |
---|
941 | See lang/userdict_ja.txt for a sample user dictionary file. |
---|
942 | |
---|
943 | See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. |
---|
944 | --> |
---|
945 | <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> |
---|
946 | <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>--> |
---|
947 | <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) --> |
---|
948 | <filter class="solr.JapaneseBaseFormFilterFactory"/> |
---|
949 | <!-- Removes tokens with certain part-of-speech tags --> |
---|
950 | <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/> |
---|
951 | <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) --> |
---|
952 | <filter class="solr.CJKWidthFilterFactory"/> |
---|
953 | <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking --> |
---|
954 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" /> |
---|
955 | <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) --> |
---|
956 | <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/> |
---|
957 | <!-- Lower-cases romaji characters --> |
---|
958 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
959 | </analyzer> |
---|
960 | </fieldType> |
---|
961 | |
---|
962 | <!-- Latvian --> |
---|
963 | <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100"> |
---|
964 | <analyzer> |
---|
965 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
966 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
967 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/> |
---|
968 | <filter class="solr.LatvianStemFilterFactory"/> |
---|
969 | </analyzer> |
---|
970 | </fieldType> |
---|
971 | |
---|
972 | <!-- Dutch --> |
---|
973 | <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100"> |
---|
974 | <analyzer> |
---|
975 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
976 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
977 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/> |
---|
978 | <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/> |
---|
979 | <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/> |
---|
980 | </analyzer> |
---|
981 | </fieldType> |
---|
982 | |
---|
983 | <!-- Norwegian --> |
---|
984 | <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100"> |
---|
985 | <analyzer> |
---|
986 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
987 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
988 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/> |
---|
989 | <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/> |
---|
990 | <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> --> |
---|
991 | <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> --> |
---|
992 | </analyzer> |
---|
993 | </fieldType> |
---|
994 | |
---|
995 | <!-- Portuguese --> |
---|
996 | <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> |
---|
997 | <analyzer> |
---|
998 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
999 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1000 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/> |
---|
1001 | <filter class="solr.PortugueseLightStemFilterFactory"/> |
---|
1002 | <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> |
---|
1003 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> |
---|
1004 | <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> --> |
---|
1005 | </analyzer> |
---|
1006 | </fieldType> |
---|
1007 | |
---|
1008 | <!-- Romanian --> |
---|
1009 | <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100"> |
---|
1010 | <analyzer> |
---|
1011 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1012 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1013 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/> |
---|
1014 | <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/> |
---|
1015 | </analyzer> |
---|
1016 | </fieldType> |
---|
1017 | |
---|
1018 | <!-- Russian --> |
---|
1019 | <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100"> |
---|
1020 | <analyzer> |
---|
1021 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1022 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1023 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/> |
---|
1024 | <filter class="solr.SnowballPorterFilterFactory" language="Russian"/> |
---|
1025 | <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> --> |
---|
1026 | </analyzer> |
---|
1027 | </fieldType> |
---|
1028 | |
---|
1029 | <!-- Swedish --> |
---|
1030 | <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100"> |
---|
1031 | <analyzer> |
---|
1032 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1033 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1034 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/> |
---|
1035 | <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/> |
---|
1036 | <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> --> |
---|
1037 | </analyzer> |
---|
1038 | </fieldType> |
---|
1039 | |
---|
1040 | <!-- Thai --> |
---|
1041 | <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100"> |
---|
1042 | <analyzer> |
---|
1043 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1044 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1045 | <filter class="solr.ThaiWordFilterFactory"/> |
---|
1046 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/> |
---|
1047 | </analyzer> |
---|
1048 | </fieldType> |
---|
1049 | |
---|
1050 | <!-- Turkish --> |
---|
1051 | <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100"> |
---|
1052 | <analyzer> |
---|
1053 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1054 | <filter class="solr.TurkishLowerCaseFilterFactory"/> |
---|
1055 | <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/> |
---|
1056 | <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/> |
---|
1057 | </analyzer> |
---|
1058 | </fieldType> |
---|
1059 | </types> |
---|
1060 | |
---|
1061 | |
---|
1062 | |
---|
1063 | </schema> |
---|