1 | <?xml version="1.0" encoding="UTF-8" ?> |
---|
2 | <!-- |
---|
3 | Licensed to the Apache Software Foundation (ASF) under one or more |
---|
4 | contributor license agreements. See the NOTICE file distributed with |
---|
5 | this work for additional information regarding copyright ownership. |
---|
6 | The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
7 | (the "License"); you may not use this file except in compliance with |
---|
8 | the License. You may obtain a copy of the License at |
---|
9 | |
---|
10 | http://www.apache.org/licenses/LICENSE-2.0 |
---|
11 | |
---|
12 | Unless required by applicable law or agreed to in writing, software |
---|
13 | distributed under the License is distributed on an "AS IS" BASIS, |
---|
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
15 | See the License for the specific language governing permissions and |
---|
16 | limitations under the License. |
---|
17 | --> |
---|
18 | |
---|
19 | <!-- |
---|
20 | This is the Solr schema file. This file should be named "schema.xml" and |
---|
21 | should be in the conf directory under the solr home |
---|
22 | (i.e. ./solr/conf/schema.xml by default) |
---|
23 | or located where the classloader for the Solr webapp can find it. |
---|
24 | |
---|
25 | This example schema is the recommended starting point for users. |
---|
26 | It should be kept correct and concise, usable out-of-the-box. |
---|
27 | |
---|
28 | For more information on how to customize this file, please see |
---|
29 | http://wiki.apache.org/solr/SchemaXml |
---|
30 | |
---|
31 | PERFORMANCE NOTE: this schema includes many optional features and should not |
---|
32 | be used for benchmarking. To improve performance one could |
---|
33 | - set stored="false" for all fields possible (esp large fields) when you |
---|
34 | only need to search on the field but don't need to return the original |
---|
35 | value. |
---|
36 | - set indexed="false" if you don't need to search on the field, but only |
---|
37 | return the field as a result of searching on other indexed fields. |
---|
38 | - remove all unneeded copyField statements |
---|
39 | - for best index size and searching performance, set indexed="false" |
---|
40 | for all general text fields, use copyField to copy them to the |
---|
41 | catchall "text" field, and use that for searching. |
---|
42 | - For maximum indexing performance, use the StreamingUpdateSolrServer |
---|
43 | java client. |
---|
44 | - Remember to run the JVM in server mode, and use a higher logging level |
---|
45 | that avoids logging every request |
---|
46 | --> |
---|
47 | |
---|
48 | <schema name="teste" version="1.5"> |
---|
49 | <!-- attribute "name" is the name of this schema and is only used for display purposes. |
---|
50 | version="x.y" is Solr's version number for the schema syntax and semantics. It should |
---|
51 | not normally be changed by applications. |
---|
52 | 1.0: multiValued attribute did not exist, all fields are multiValued by nature |
---|
53 | 1.1: multiValued attribute introduced, false by default |
---|
54 | 1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields. |
---|
55 | 1.3: removed optional field compress feature |
---|
56 | 1.4: default auto-phrase (QueryParser feature) to off |
---|
57 | 1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...) |
---|
58 | --> |
---|
59 | |
---|
60 | <fields> |
---|
61 | <!-- Valid attributes for fields: |
---|
62 | name: mandatory - the name for the field |
---|
63 | type: mandatory - the name of a field type from the |
---|
64 | <types> fieldType section |
---|
65 | indexed: true if this field should be indexed (searchable or sortable) |
---|
66 | stored: true if this field should be retrievable |
---|
67 | multiValued: true if this field may contain multiple values per document |
---|
68 | omitNorms: (expert) set to true to omit the norms associated with |
---|
69 | this field (this disables length normalization and index-time |
---|
70 | boosting for the field, and saves some memory). Only full-text |
---|
71 | fields or fields that need an index-time boost need norms. |
---|
72 | Norms are omitted for primitive (non-analyzed) types by default. |
---|
73 | termVectors: [false] set to true to store the term vector for a |
---|
74 | given field. |
---|
75 | When using MoreLikeThis, fields used for similarity should be |
---|
76 | stored for best performance. |
---|
77 | termPositions: Store position information with the term vector. |
---|
78 | This will increase storage costs. |
---|
79 | termOffsets: Store offset information with the term vector. This |
---|
80 | will increase storage costs. |
---|
81 | required: The field is required. It will throw an error if the |
---|
82 | value does not exist |
---|
83 | default: a value that should be used if no value is specified |
---|
84 | when adding a document. |
---|
85 | --> |
---|
86 | |
---|
87 | <!-- field names should consist of alphanumeric or underscore characters only and |
---|
88 | not start with a digit. This is not currently strictly enforced, |
---|
89 | but other field names will not have first class support from all components |
---|
90 | and back compatibility is not guaranteed. Names with both leading and |
---|
91 | trailing underscores (e.g. _version_) are reserved. |
---|
92 | --> |
---|
93 | |
---|
94 | <field name="id" type="string" indexed="true" stored="true" required="true" /> <!-- document key; declared as the uniqueKey further down in this schema --> |
---|
95 | <field name="user" type="string" indexed="true" stored="true" required="true" /> |
---|
96 | <field name="msg_no" type="string" indexed="true" stored="true" required="true" /> |
---|
97 | <field name="folder" type="string" indexed="true" stored="true" required="true" /> |
---|
98 | <field name="from" type="text_general" indexed="true" stored="true"/> |
---|
99 | <field name="to" type="text_general" indexed="true" stored="true"/> |
---|
100 | <field name="subject" type="text_general" indexed="true" stored="true" /> |
---|
101 | <field name="content" type="text_general" indexed="true" stored="true" /> |
---|
102 | <field name="copyto" type="text_general" indexed="true" stored="true" /> |
---|
103 | <field name="hiddencopyto" type="text_general" indexed="true" stored="true" /> |
---|
104 | <field name="sent_date" type="text_general" indexed="true" stored="true" /> <!-- NOTE(review): typed as tokenized text (text_general), not as a date type, so date-range queries and chronological sorting presumably will not work on this field; confirm this is intended --> |
---|
105 | <field name="text" type="text_general" indexed="true" stored="true" /> <!-- NOTE(review): nearby comments describe a copyField catch-all, but no active copyField targeting this field is visible in this part of the file; confirm how it is populated --> |
---|
106 | |
---|
107 | <!-- Common metadata fields, named specifically to match up with |
---|
108 | SolrCell metadata when parsing rich documents such as Word, PDF. |
---|
109 | Some fields are multiValued only because Tika currently may return |
---|
110 | multiple values for them. |
---|
111 | --> |
---|
112 | |
---|
113 | |
---|
114 | <!-- catchall field, containing all other searchable text fields (implemented |
---|
115 | via copyField further on in this schema) --> |
---|
116 | |
---|
117 | <!-- catchall text field that indexes tokens both normally and in reverse for efficient |
---|
118 | leading wildcard queries. --> |
---|
119 | |
---|
120 | <!-- non-tokenized version of manufacturer to make it easier to sort or group |
---|
121 | results by manufacturer. copied from "manu" via copyField --> |
---|
122 | |
---|
123 | <!-- Uncommenting the following will create a "timestamp" field using |
---|
124 | a default value of "NOW" to indicate when each document was indexed. |
---|
125 | --> |
---|
126 | <!-- |
---|
127 | <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> |
---|
128 | --> |
---|
129 | |
---|
130 | <!-- Dynamic field definitions allow using convention over configuration |
---|
131 | for fields via the specification of patterns to match field names. |
---|
132 | EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i) |
---|
133 | RESTRICTION: the glob-like pattern in the name attribute must have |
---|
134 | a "*" only at the start or the end. --> |
---|
135 | |
---|
136 | <dynamicField name="*_i" type="int" indexed="true" stored="true"/> |
---|
137 | <dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true"/> |
---|
138 | <dynamicField name="*_s" type="string" indexed="true" stored="true" /> |
---|
139 | <dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/> |
---|
140 | <dynamicField name="*_l" type="long" indexed="true" stored="true"/> |
---|
141 | <dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/> |
---|
142 | <dynamicField name="*_t" type="text_general" indexed="true" stored="true"/> |
---|
143 | <dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/> |
---|
144 | <dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true"/> |
---|
145 | <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/> |
---|
146 | <dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true"/> |
---|
147 | <dynamicField name="*_f" type="float" indexed="true" stored="true"/> |
---|
148 | <dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true"/> |
---|
149 | <dynamicField name="*_d" type="double" indexed="true" stored="true"/> |
---|
150 | <dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true"/> |
---|
151 | |
---|
152 | <!-- Type used to index the lat and lon components for the "location" FieldType --> |
---|
153 | <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" /> |
---|
154 | |
---|
155 | <dynamicField name="*_dt" type="date" indexed="true" stored="true"/> |
---|
156 | <dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/> |
---|
157 | <dynamicField name="*_p" type="location" indexed="true" stored="true"/> |
---|
158 | |
---|
159 | <!-- some trie-coded dynamic fields for faster range queries --> |
---|
160 | <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/> |
---|
161 | <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/> |
---|
162 | <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/> |
---|
163 | <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/> |
---|
164 | <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/> |
---|
165 | |
---|
166 | <dynamicField name="*_pi" type="pint" indexed="true" stored="true"/> |
---|
167 | <dynamicField name="*_c" type="currency" indexed="true" stored="true"/> |
---|
168 | |
---|
169 | <dynamicField name="ignored_*" type="ignored" multiValued="true"/> |
---|
170 | <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/> |
---|
171 | |
---|
172 | <dynamicField name="random_*" type="random" /> |
---|
173 | |
---|
174 | <!-- uncomment the following to ignore any fields that don't already match an existing |
---|
175 | field name or dynamic field, rather than reporting them as an error. |
---|
176 | alternately, change the type="ignored" to some other type e.g. "text" if you want |
---|
177 | unknown fields indexed and/or stored by default --> |
---|
178 | <!--dynamicField name="*" type="ignored" multiValued="true" /--> |
---|
179 | |
---|
180 | </fields> |
---|
181 | |
---|
182 | |
---|
183 | <!-- Field to use to determine and enforce document uniqueness. |
---|
184 | Unless this field is marked with required="false", it will be a required field |
---|
185 | --> |
---|
186 | <uniqueKey>id</uniqueKey> |
---|
187 | |
---|
188 | <!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when |
---|
189 | parsing a query string that isn't explicit about the field. Machine (non-user) |
---|
190 | generated queries are best made explicit, or they can use the "df" request parameter |
---|
191 | which takes precedence over this. |
---|
192 | Note: Un-commenting defaultSearchField will be insufficient if your request handler |
---|
193 | in solrconfig.xml defines "df", which takes precedence. That would need to be removed. |
---|
194 | <defaultSearchField>text</defaultSearchField> --> |
---|
195 | |
---|
196 | <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers |
---|
197 | when parsing a query string to determine if a clause of the query should be marked as |
---|
198 | required or optional, assuming the clause isn't already marked by some operator. |
---|
199 | The default is OR, which is generally assumed so it is not a good idea to change it |
---|
200 | globally here. The "q.op" request parameter takes precedence over this. |
---|
201 | <solrQueryParser defaultOperator="OR"/> --> |
---|
202 | |
---|
203 | <!-- copyField commands copy one field to another at the time a document |
---|
204 | is added to the index. It's used either to index the same field differently, |
---|
205 | or to add multiple fields to the same field for easier/faster searching. --> |
---|
206 | |
---|
207 | <!-- Copy the price into a currency enabled field (default USD) --> |
---|
208 | |
---|
209 | <!-- Several source fields can be copied to the [text] field, one copyField each |
---|
210 | (none are active above in this file). Another way to map multiple source fields to the same |
---|
211 | destination field is to use the dynamic field syntax. |
---|
212 | copyField also supports a maxChars to copy setting. --> |
---|
213 | |
---|
214 | <!-- <copyField source="*_t" dest="text" maxChars="3000"/> --> |
---|
215 | |
---|
216 | <!-- copy name to alphaNameSort, a field designed for sorting by name --> |
---|
217 | <!-- <copyField source="name" dest="alphaNameSort"/> --> |
---|
218 | |
---|
219 | |
---|
220 | <!-- Similarity is the scoring routine for each document vs. a query. |
---|
221 | A custom similarity may be specified here, but the default is fine |
---|
222 | for most applications. --> |
---|
223 | <!-- <similarity class="org.apache.lucene.search.similarities.DefaultSimilarity"/> --> |
---|
224 | <!-- ... OR ... |
---|
225 | Specify a SimilarityFactory class name implementation |
---|
226 | allowing parameters to be used. |
---|
227 | --> |
---|
228 | <!-- |
---|
229 | <similarity class="com.example.solr.CustomSimilarityFactory"> |
---|
230 | <str name="paramkey">param value</str> |
---|
231 | </similarity> |
---|
232 | --> |
---|
233 | |
---|
234 | |
---|
235 | |
---|
236 | <types> |
---|
237 | <!-- field type definitions. The "name" attribute is |
---|
238 | just a label to be used by field definitions. The "class" |
---|
239 | attribute and any other attributes determine the real |
---|
240 | behavior of the fieldType. |
---|
241 | Class names starting with "solr" refer to java classes in a |
---|
242 | standard package such as org.apache.solr.analysis |
---|
243 | --> |
---|
244 | |
---|
245 | <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> |
---|
246 | <fieldType name="string" class="solr.StrField" sortMissingLast="true" /> |
---|
247 | |
---|
248 | <!-- boolean type: "true" or "false" --> |
---|
249 | <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> |
---|
250 | |
---|
251 | <!-- The optional sortMissingLast and sortMissingFirst attributes are |
---|
252 | currently supported on types that are sorted internally as strings |
---|
253 | and on numeric types. |
---|
254 | This includes "string","boolean", and, as of 3.5 (and 4.x), |
---|
255 | int, float, long, date, double, including the "Trie" variants. |
---|
256 | - If sortMissingLast="true", then a sort on this field will cause documents |
---|
257 | without the field to come after documents with the field, |
---|
258 | regardless of the requested sort order (asc or desc). |
---|
259 | - If sortMissingFirst="true", then a sort on this field will cause documents |
---|
260 | without the field to come before documents with the field, |
---|
261 | regardless of the requested sort order. |
---|
262 | - If sortMissingLast="false" and sortMissingFirst="false" (the default), |
---|
263 | then default lucene sorting will be used which places docs without the |
---|
264 | field first in an ascending sort and last in a descending sort. |
---|
265 | --> |
---|
266 | |
---|
267 | <!-- |
---|
268 | Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. |
---|
269 | --> |
---|
270 | <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/> |
---|
271 | <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/> |
---|
272 | <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> |
---|
273 | <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/> |
---|
274 | |
---|
275 | <!-- |
---|
276 | Numeric field types that index each value at various levels of precision |
---|
277 | to accelerate range queries when the number of values between the range |
---|
278 | endpoints is large. See the javadoc for NumericRangeQuery for internal |
---|
279 | implementation details. |
---|
280 | |
---|
281 | Smaller precisionStep values (specified in bits) will lead to more tokens |
---|
282 | indexed per value, slightly larger index size, and faster range queries. |
---|
283 | A precisionStep of 0 disables indexing at different precision levels. |
---|
284 | --> |
---|
285 | <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/> |
---|
286 | <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/> |
---|
287 | <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/> |
---|
288 | <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/> |
---|
289 | |
---|
290 | <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and |
---|
291 | is a more restricted form of the canonical representation of dateTime |
---|
292 | http://www.w3.org/TR/xmlschema-2/#dateTime |
---|
293 | The trailing "Z" designates UTC time and is mandatory. |
---|
294 | Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z |
---|
295 | All other components are mandatory. |
---|
296 | |
---|
297 | Expressions can also be used to denote calculations that should be |
---|
298 | performed relative to "NOW" to determine the value, ie... |
---|
299 | |
---|
300 | NOW/HOUR |
---|
301 | ... Round to the start of the current hour |
---|
302 | NOW-1DAY |
---|
303 | ... Exactly 1 day prior to now |
---|
304 | NOW/DAY+6MONTHS+3DAYS |
---|
305 | ... 6 months and 3 days in the future from the start of |
---|
306 | the current day |
---|
307 | |
---|
308 | Consult the DateField javadocs for more information. |
---|
309 | |
---|
310 | Note: For faster range queries, consider the tdate type |
---|
311 | --> |
---|
312 | <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/> |
---|
313 | |
---|
314 | <!-- A Trie based date field for faster date range queries and date faceting. --> |
---|
315 | <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/> |
---|
316 | |
---|
317 | |
---|
318 | <!-- Binary data type. Values should be sent and retrieved as Base64-encoded strings. --> |
---|
319 | <fieldType name="binary" class="solr.BinaryField"/> |
---|
320 | |
---|
321 | <!-- |
---|
322 | Note: |
---|
323 | These should only be used for compatibility with existing indexes (created with lucene or older Solr versions). |
---|
324 | Use Trie based fields instead. As of Solr 3.5 and 4.x, Trie based fields support sortMissingFirst/Last |
---|
325 | |
---|
326 | Plain numeric field types that store and index the text |
---|
327 | value verbatim (and hence don't correctly support range queries, since the |
---|
328 | lexicographic ordering isn't equal to the numeric ordering) |
---|
329 | --> |
---|
330 | <fieldType name="pint" class="solr.IntField"/> |
---|
331 | <fieldType name="plong" class="solr.LongField"/> |
---|
332 | <fieldType name="pfloat" class="solr.FloatField"/> |
---|
333 | <fieldType name="pdouble" class="solr.DoubleField"/> |
---|
334 | <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/> |
---|
335 | |
---|
336 | <!-- The "RandomSortField" is not used to store or search any |
---|
337 | data. You can declare fields of this type in your schema |
---|
338 | to generate pseudo-random orderings of your docs for sorting |
---|
339 | or function purposes. The ordering is generated based on the field |
---|
340 | name and the version of the index. As long as the index version |
---|
341 | remains unchanged, and the same field name is reused, |
---|
342 | the ordering of the docs will be consistent. |
---|
343 | If you want different pseudo-random orderings of documents, |
---|
344 | for the same version of the index, use a dynamicField and |
---|
345 | change the field name in the request. |
---|
346 | --> |
---|
347 | <fieldType name="random" class="solr.RandomSortField" indexed="true" /> |
---|
348 | |
---|
349 | <!-- solr.TextField allows the specification of custom text analyzers |
---|
350 | specified as a tokenizer and a list of token filters. Different |
---|
351 | analyzers may be specified for indexing and querying. |
---|
352 | |
---|
353 | The optional positionIncrementGap puts space between multiple fields of |
---|
354 | this type on the same document, with the purpose of preventing false phrase |
---|
355 | matching across fields. |
---|
356 | |
---|
357 | For more info on customizing your analyzer chain, please see |
---|
358 | http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters |
---|
359 | --> |
---|
360 | |
---|
361 | <!-- One can also specify an existing Analyzer class that has a |
---|
362 | default constructor via the class attribute on the analyzer element. |
---|
363 | Example: |
---|
364 | <fieldType name="text_greek" class="solr.TextField"> |
---|
365 | <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> |
---|
366 | </fieldType> |
---|
367 | --> |
---|
368 | |
---|
369 | <!-- A text field that only splits on whitespace for exact matching of words --> |
---|
370 | <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> |
---|
371 | <analyzer> |
---|
372 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
373 | </analyzer> |
---|
374 | </fieldType> |
---|
375 | |
---|
376 | <!-- A general text field that has reasonable, generic |
---|
377 | cross-language defaults: it tokenizes with StandardTokenizer, |
---|
378 | removes stop words from case-insensitive "stopwords.txt" |
---|
379 | (empty by default), and down cases. At query time only, it |
---|
380 | also applies synonyms. --> |
---|
381 | <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> |
---|
382 | <analyzer type="index"> |
---|
383 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
384 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
385 | <!-- in this example, we will only use synonyms at query time |
---|
386 | <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
---|
387 | --> |
---|
388 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
389 | </analyzer> |
---|
390 | <analyzer type="query"> |
---|
391 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
392 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
393 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
394 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
395 | </analyzer> |
---|
396 | </fieldType> |
---|
397 | |
---|
398 | <!-- A text field with defaults appropriate for English: it |
---|
399 | tokenizes with StandardTokenizer, removes English stop words |
---|
400 | (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and |
---|
401 | finally applies Porter's stemming. The query time analyzer |
---|
402 | also applies synonyms from synonyms.txt. --> |
---|
403 | <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> |
---|
404 | <analyzer type="index"> |
---|
405 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
406 | <!-- in this example, we will only use synonyms at query time |
---|
407 | <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
---|
408 | --> |
---|
409 | <!-- Case insensitive stop word removal. |
---|
410 | add enablePositionIncrements=true in both the index and query |
---|
411 | analyzers to leave a 'gap' for more accurate phrase queries. |
---|
412 | --> |
---|
413 | <filter class="solr.StopFilterFactory" |
---|
414 | ignoreCase="true" |
---|
415 | words="lang/stopwords_en.txt" |
---|
416 | enablePositionIncrements="true" |
---|
417 | /> |
---|
418 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
419 | <filter class="solr.EnglishPossessiveFilterFactory"/> |
---|
420 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
421 | <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
---|
422 | <filter class="solr.EnglishMinimalStemFilterFactory"/> |
---|
423 | --> |
---|
424 | <filter class="solr.PorterStemFilterFactory"/> |
---|
425 | </analyzer> |
---|
426 | <analyzer type="query"> |
---|
427 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
428 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
429 | <filter class="solr.StopFilterFactory" |
---|
430 | ignoreCase="true" |
---|
431 | words="lang/stopwords_en.txt" |
---|
432 | enablePositionIncrements="true" |
---|
433 | /> |
---|
434 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
435 | <filter class="solr.EnglishPossessiveFilterFactory"/> |
---|
436 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
437 | <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: |
---|
438 | <filter class="solr.EnglishMinimalStemFilterFactory"/> |
---|
439 | --> |
---|
440 | <filter class="solr.PorterStemFilterFactory"/> |
---|
441 | </analyzer> |
---|
442 | </fieldType> |
---|
443 | |
---|
444 | <!-- A text field with defaults appropriate for English, plus |
---|
445 | aggressive word-splitting and autophrase features enabled. |
---|
446 | This field is just like text_en, except it adds |
---|
447 | WordDelimiterFilter to enable splitting and matching of |
---|
448 | words on case-change, alpha numeric boundaries, and |
---|
449 | non-alphanumeric chars. This means certain compound word |
---|
450 | cases will work, for example query "wi fi" will match |
---|
451 | document "WiFi" or "wi-fi". |
---|
452 | --> |
---|
453 | <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> |
---|
454 | <analyzer type="index"> |
---|
455 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
456 | <!-- in this example, we will only use synonyms at query time |
---|
457 | <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
---|
458 | --> |
---|
459 | <!-- Case insensitive stop word removal. |
---|
460 | add enablePositionIncrements=true in both the index and query |
---|
461 | analyzers to leave a 'gap' for more accurate phrase queries. |
---|
462 | --> |
---|
463 | <filter class="solr.StopFilterFactory" |
---|
464 | ignoreCase="true" |
---|
465 | words="lang/stopwords_en.txt" |
---|
466 | enablePositionIncrements="true" |
---|
467 | /> |
---|
468 | <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> |
---|
469 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
470 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
471 | <filter class="solr.PorterStemFilterFactory"/> |
---|
472 | </analyzer> |
---|
473 | <analyzer type="query"> |
---|
474 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
475 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
476 | <filter class="solr.StopFilterFactory" |
---|
477 | ignoreCase="true" |
---|
478 | words="lang/stopwords_en.txt" |
---|
479 | enablePositionIncrements="true" |
---|
480 | /> |
---|
481 | <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
---|
482 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
483 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
484 | <filter class="solr.PorterStemFilterFactory"/> |
---|
485 | </analyzer> |
---|
486 | </fieldType> |
---|
487 | |
---|
488 | <!-- Less flexible matching, but less false matches. Probably not ideal for product names, |
---|
489 | but may be good for SKUs. Can insert dashes in the wrong place and still match. --> |
---|
490 | <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> |
---|
491 | <analyzer> <!-- no type attribute: this single chain is used for both indexing and querying --> |
---|
492 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
493 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> |
---|
494 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/> <!-- NOTE(review): unlike the StopFilter in text_en and text_en_splitting, enablePositionIncrements is not set here; confirm this is intended --> |
---|
495 | <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> <!-- part generation is off and catenation is on, so split pieces are only emitted re-joined --> |
---|
496 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
497 | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> |
---|
498 | <filter class="solr.EnglishMinimalStemFilterFactory"/> |
---|
499 | <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes |
---|
500 | possible with WordDelimiterFilter in conjunction with stemming. --> |
---|
501 | <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
---|
502 | </analyzer> |
---|
503 | </fieldType> |
---|
504 | |
---|
505 | <!-- Just like text_general except it reverses the characters of |
---|
506 | each token, to enable more efficient leading wildcard queries. --> |
---|
507 | <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> |
---|
508 | <analyzer type="index"> |
---|
509 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
510 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
511 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
512 | <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" |
---|
513 | maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> |
---|
514 | </analyzer> |
---|
515 | <analyzer type="query"> |
---|
516 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
517 | <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
---|
518 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
---|
519 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
520 | </analyzer> |
---|
521 | </fieldType> |
---|
522 | |
---|
523 | <!-- charFilter + WhitespaceTokenizer --> |
---|
524 | <!-- |
---|
525 | <fieldType name="text_char_norm" class="solr.TextField" positionIncrementGap="100" > |
---|
526 | <analyzer> |
---|
527 | <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/> |
---|
528 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
529 | </analyzer> |
---|
530 | </fieldType> |
---|
531 | --> |
---|
532 | |
---|
533 | <!-- This is an example of using the KeywordTokenizer along |
---|
534 | with various TokenFilterFactories to produce a sortable field |
---|
535 | that does not include some properties of the source text (here: case, surrounding whitespace and every character other than a-z) |
---|
536 | --> |
---|
537 | <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true"> |
---|
538 | <analyzer> |
---|
539 | <!-- KeywordTokenizer does no actual tokenizing, so the entire |
---|
540 | input string is preserved as a single token |
---|
541 | --> |
---|
542 | <tokenizer class="solr.KeywordTokenizerFactory"/> |
---|
543 | <!-- The LowerCase TokenFilter does what you expect, which can be |
---|
544 | useful when you want your sorting to be case insensitive |
---|
545 | --> |
---|
546 | <filter class="solr.LowerCaseFilterFactory" /> |
---|
547 | <!-- The TrimFilter removes any leading or trailing whitespace --> |
---|
548 | <filter class="solr.TrimFilterFactory" /> |
---|
549 | <!-- The PatternReplaceFilter gives you the flexibility to use |
---|
550 | Java Regular expression to replace any sequence of characters |
---|
551 | matching a pattern with an arbitrary replacement string, |
---|
552 | which may include back references to portions of the original |
---|
553 | string matched by the pattern. |
---|
554 | Here the pattern ([^a-z]) with an empty replacement and replace="all" strips every character that is not a lowercase ASCII letter a-z. |
---|
555 | See the Java Regular Expression documentation for more |
---|
556 | information on pattern and replacement string syntax. |
---|
557 | |
---|
558 | http://java.sun.com/j2se/1.6.0/docs/api/java/util/regex/package-summary.html |
---|
559 | --> |
---|
560 | <filter class="solr.PatternReplaceFilterFactory" |
---|
561 | pattern="([^a-z])" replacement="" replace="all" |
---|
562 | /> |
---|
563 | </analyzer> |
---|
564 | </fieldType> |
---|
565 | <!-- Phonetic field type: StandardTokenizerFactory output is passed through DoubleMetaphoneFilterFactory (inject="false"). Indexed but not stored. --> |
---|
566 | <fieldType name="phonetic" stored="false" indexed="true" class="solr.TextField" > |
---|
567 | <analyzer> |
---|
568 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
569 | <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> |
---|
570 | </analyzer> |
---|
571 | </fieldType> |
---|
572 | <!-- Whitespace-tokenized field type whose tokens may carry a float payload (see the DelimitedPayloadTokenFilterFactory notes below). Indexed but not stored. --> |
---|
573 | <fieldType name="payloads" stored="false" indexed="true" class="solr.TextField" > |
---|
574 | <analyzer> |
---|
575 | <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
---|
576 | <!-- |
---|
577 | The DelimitedPayloadTokenFilter can put payloads on tokens... for example, |
---|
578 | a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f |
---|
579 | Attributes of the DelimitedPayloadTokenFilterFactory : |
---|
580 | "delimiter" - a one character delimiter. Default is | (pipe) |
---|
581 | "encoder" - how to encode the following value into a payload |
---|
582 | float -> org.apache.lucene.analysis.payloads.FloatEncoder, |
---|
583 | integer -> o.a.l.a.p.IntegerEncoder |
---|
584 | identity -> o.a.l.a.p.IdentityEncoder |
---|
585 | Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. |
---|
586 | --> |
---|
587 | <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> |
---|
588 | </analyzer> |
---|
589 | </fieldType> |
---|
590 | |
---|
591 | <!-- Lowercases the entire field value, keeping it as a single token (KeywordTokenizerFactory followed by LowerCaseFilterFactory). --> |
---|
592 | <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> |
---|
593 | <analyzer> |
---|
594 | <tokenizer class="solr.KeywordTokenizerFactory"/> |
---|
595 | <filter class="solr.LowerCaseFilterFactory" /> |
---|
596 | </analyzer> |
---|
597 | </fieldType> |
---|
598 | <!-- Field type whose analyzer consists only of PathHierarchyTokenizerFactory; no tokenizer attributes are set, so its defaults apply. --> |
---|
599 | <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100"> |
---|
600 | <analyzer> |
---|
601 | <tokenizer class="solr.PathHierarchyTokenizerFactory"/> |
---|
602 | </analyzer> |
---|
603 | </fieldType> |
---|
604 | |
---|
605 | |
---|
606 | <!-- Since fields of this type are by default neither stored nor indexed, |
---|
607 | any data added to them will be ignored outright. --> |
---|
608 | <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> |
---|
609 | |
---|
610 | <!-- This point type indexes the coordinates as separate fields (subFields). |
---|
611 | If subFieldType is defined, it references a type, and a dynamic field |
---|
612 | definition is created matching *___<typename>. Alternatively, if |
---|
613 | subFieldSuffix is defined, that is used to create the subFields. |
---|
614 | Example: if subFieldType="double", then the coordinates would be |
---|
615 | indexed in fields myloc_0___double,myloc_1___double. |
---|
616 | Example: if subFieldSuffix="_d" then the coordinates would be indexed |
---|
617 | in fields myloc_0_d,myloc_1_d |
---|
618 | The subFields are an implementation detail of the fieldType, and end |
---|
619 | users normally should not need to know about them. |
---|
620 | The definition below uses dimension="2" with subFieldSuffix="_d". --> |
---|
621 | <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> |
---|
622 | |
---|
623 | <!-- A specialized field for geospatial search (solr.LatLonType, with subFieldSuffix set to "_coordinate"). If indexed, this fieldType must not be multivalued. --> |
---|
624 | <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> |
---|
625 | |
---|
626 | <!-- |
---|
627 | A Geohash is a compact representation of a latitude/longitude pair in a single field. |
---|
628 | See http://wiki.apache.org/solr/SpatialSearch |
---|
629 | --> |
---|
630 | <fieldType name="geohash" class="solr.GeoHashField"/> |
---|
631 | |
---|
632 | <!-- Money/currency field type. See http://wiki.apache.org/solr/MoneyFieldType |
---|
633 | Parameters: |
---|
634 | defaultCurrency: Specifies the default currency if none specified. Defaults to "USD" |
---|
635 | precisionStep: Specifies the precisionStep for the TrieLong field used for the amount |
---|
636 | providerClass: Lets you plug in other exchange rate provider backends: |
---|
637 | solr.FileExchangeRateProvider is the default and takes one parameter: |
---|
638 | currencyConfig: name of an XML file holding exchange rates |
---|
639 | solr.OpenExchangeRatesOrgProvider uses rates from openexchangerates.org: |
---|
640 | ratesFileLocation: URL or path to rates JSON file (default latest.json on the web) |
---|
641 | refreshInterval: Number of minutes between each rates fetch (default: 1440, min: 60) |
---|
642 | The definition below sets no providerClass, so the default provider is used with currencyConfig="currency.xml". --> |
---|
643 | <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" /> |
---|
644 | |
---|
645 | |
---|
646 | |
---|
647 | <!-- some examples for different languages (generally ordered by ISO code) --> |
---|
648 | |
---|
649 | <!-- Arabic: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_ar.txt, then ArabicNormalizationFilter and ArabicStemFilter --> |
---|
650 | <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100"> |
---|
651 | <analyzer> |
---|
652 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
653 | <!-- lowercasing, for any non-Arabic text --> |
---|
654 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
655 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/> |
---|
656 | <!-- normalizes ﻯ to ﻱ, etc --> |
---|
657 | <filter class="solr.ArabicNormalizationFilterFactory"/> |
---|
658 | <filter class="solr.ArabicStemFilterFactory"/> |
---|
659 | </analyzer> |
---|
660 | </fieldType> |
---|
661 | |
---|
662 | <!-- Bulgarian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_bg.txt, then BulgarianStemFilter --> |
---|
663 | <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100"> |
---|
664 | <analyzer> |
---|
665 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
666 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
667 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/> |
---|
668 | <filter class="solr.BulgarianStemFilterFactory"/> |
---|
669 | </analyzer> |
---|
670 | </fieldType> |
---|
671 | |
---|
672 | <!-- Catalan: StandardTokenizer, ElisionFilter (lang/contractions_ca.txt), LowerCaseFilter, stop words from lang/stopwords_ca.txt, then the Snowball "Catalan" stemmer --> |
---|
673 | <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100"> |
---|
674 | <analyzer> |
---|
675 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
676 | <!-- removes l', etc (the articles listed in lang/contractions_ca.txt) --> |
---|
677 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/> |
---|
678 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
679 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/> |
---|
680 | <filter class="solr.SnowballPorterFilterFactory" language="Catalan"/> |
---|
681 | </analyzer> |
---|
682 | </fieldType> |
---|
683 | |
---|
684 | <!-- CJK bigram: StandardTokenizer, CJKWidthFilter, LowerCaseFilter, then CJKBigramFilter (see text_ja for a Japanese configuration using morphological analysis) --> |
---|
685 | <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> |
---|
686 | <analyzer> |
---|
687 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
688 | <!-- normalize width before bigramming, as e.g. half-width dakuten combine --> |
---|
689 | <filter class="solr.CJKWidthFilterFactory"/> |
---|
690 | <!-- lowercasing, for any non-CJK text --> |
---|
691 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
692 | <filter class="solr.CJKBigramFilterFactory"/> |
---|
693 | </analyzer> |
---|
694 | </fieldType> |
---|
695 | |
---|
696 | <!-- Czech: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_cz.txt, then CzechStemFilter --> |
---|
697 | <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100"> |
---|
698 | <analyzer> |
---|
699 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
700 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
701 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/> |
---|
702 | <filter class="solr.CzechStemFilterFactory"/> |
---|
703 | </analyzer> |
---|
704 | </fieldType> |
---|
705 | |
---|
706 | <!-- Danish: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_da.txt (snowball format), then the Snowball "Danish" stemmer --> |
---|
707 | <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100"> |
---|
708 | <analyzer> |
---|
709 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
710 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
711 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/> |
---|
712 | <filter class="solr.SnowballPorterFilterFactory" language="Danish"/> |
---|
713 | </analyzer> |
---|
714 | </fieldType> |
---|
715 | |
---|
716 | <!-- German: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_de.txt (snowball format), GermanNormalizationFilter, then GermanLightStemFilter; alternative stemmers are listed as comments below --> |
---|
717 | <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100"> |
---|
718 | <analyzer> |
---|
719 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
720 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
721 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/> |
---|
722 | <filter class="solr.GermanNormalizationFilterFactory"/> |
---|
723 | <filter class="solr.GermanLightStemFilterFactory"/> |
---|
724 | <!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> --> |
---|
725 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="German2"/> --> |
---|
726 | </analyzer> |
---|
727 | </fieldType> |
---|
728 | |
---|
729 | <!-- Greek: StandardTokenizer, GreekLowerCaseFilter, stop words from lang/stopwords_el.txt (ignoreCase="false"), then GreekStemFilter --> |
---|
730 | <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100"> |
---|
731 | <analyzer> |
---|
732 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
733 | <!-- Greek-specific lowercase filter, for sigma --> |
---|
734 | <filter class="solr.GreekLowerCaseFilterFactory"/> |
---|
735 | <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/> |
---|
736 | <filter class="solr.GreekStemFilterFactory"/> |
---|
737 | </analyzer> |
---|
738 | </fieldType> |
---|
739 | |
---|
740 | <!-- Spanish: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_es.txt (snowball format), then SpanishLightStemFilter --> |
---|
741 | <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100"> |
---|
742 | <analyzer> |
---|
743 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
744 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
745 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/> |
---|
746 | <filter class="solr.SpanishLightStemFilterFactory"/> |
---|
747 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> --> |
---|
748 | </analyzer> |
---|
749 | </fieldType> |
---|
750 | |
---|
751 | <!-- Basque: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_eu.txt, then the Snowball "Basque" stemmer --> |
---|
752 | <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100"> |
---|
753 | <analyzer> |
---|
754 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
755 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
756 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/> |
---|
757 | <filter class="solr.SnowballPorterFilterFactory" language="Basque"/> |
---|
758 | </analyzer> |
---|
759 | </fieldType> |
---|
760 | |
---|
761 | <!-- Persian: PersianCharFilter, StandardTokenizer, LowerCaseFilter, ArabicNormalizationFilter, PersianNormalizationFilter, then stop words from lang/stopwords_fa.txt (no stemmer is configured) --> |
---|
762 | <fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100"> |
---|
763 | <analyzer> |
---|
764 | <!-- for ZWNJ (zero-width non-joiner) --> |
---|
765 | <charFilter class="solr.PersianCharFilterFactory"/> |
---|
766 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
767 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
768 | <filter class="solr.ArabicNormalizationFilterFactory"/> |
---|
769 | <filter class="solr.PersianNormalizationFilterFactory"/> |
---|
770 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/> |
---|
771 | </analyzer> |
---|
772 | </fieldType> |
---|
773 | |
---|
774 | <!-- Finnish: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_fi.txt (snowball format), then the Snowball "Finnish" stemmer --> |
---|
775 | <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100"> |
---|
776 | <analyzer> |
---|
777 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
778 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
779 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/> |
---|
780 | <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/> |
---|
781 | <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> --> |
---|
782 | </analyzer> |
---|
783 | </fieldType> |
---|
784 | |
---|
785 | <!-- French: StandardTokenizer, ElisionFilter (lang/contractions_fr.txt), LowerCaseFilter, stop words from lang/stopwords_fr.txt (snowball format), then FrenchLightStemFilter --> |
---|
786 | <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100"> |
---|
787 | <analyzer> |
---|
788 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
789 | <!-- removes l', etc (the articles listed in lang/contractions_fr.txt) --> |
---|
790 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/> |
---|
791 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
792 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/> |
---|
793 | <filter class="solr.FrenchLightStemFilterFactory"/> |
---|
794 | <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> --> |
---|
795 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> --> |
---|
796 | </analyzer> |
---|
797 | </fieldType> |
---|
798 | |
---|
799 | <!-- Irish: StandardTokenizer, ElisionFilter (lang/contractions_ga.txt), a first StopFilter on lang/hyphenations_ga.txt, IrishLowerCaseFilter, a second StopFilter on lang/stopwords_ga.txt, then the Snowball "Irish" stemmer --> |
---|
800 | <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"> |
---|
801 | <analyzer> |
---|
802 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
803 | <!-- removes d', etc --> |
---|
804 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/> |
---|
805 | <!-- removes n-, etc. enablePositionIncrements is intentionally false here! --> |
---|
806 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/> |
---|
807 | <filter class="solr.IrishLowerCaseFilterFactory"/> |
---|
808 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/> |
---|
809 | <filter class="solr.SnowballPorterFilterFactory" language="Irish"/> |
---|
810 | </analyzer> |
---|
811 | </fieldType> |
---|
812 | |
---|
813 | <!-- Galician: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_gl.txt, then GalicianStemFilter --> |
---|
814 | <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100"> |
---|
815 | <analyzer> |
---|
816 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
817 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
818 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/> |
---|
819 | <filter class="solr.GalicianStemFilterFactory"/> |
---|
820 | <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> --> |
---|
821 | </analyzer> |
---|
822 | </fieldType> |
---|
823 | |
---|
824 | <!-- Hindi: StandardTokenizer, LowerCaseFilter, IndicNormalizationFilter, HindiNormalizationFilter, stop words from lang/stopwords_hi.txt, then HindiStemFilter --> |
---|
825 | <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100"> |
---|
826 | <analyzer> |
---|
827 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
828 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
829 | <!-- normalizes the Unicode representation --> |
---|
830 | <filter class="solr.IndicNormalizationFilterFactory"/> |
---|
831 | <!-- normalizes variations in spelling --> |
---|
832 | <filter class="solr.HindiNormalizationFilterFactory"/> |
---|
833 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/> |
---|
834 | <filter class="solr.HindiStemFilterFactory"/> |
---|
835 | </analyzer> |
---|
836 | </fieldType> |
---|
837 | |
---|
838 | <!-- Hungarian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_hu.txt (snowball format), then the Snowball "Hungarian" stemmer --> |
---|
839 | <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100"> |
---|
840 | <analyzer> |
---|
841 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
842 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
843 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/> |
---|
844 | <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/> |
---|
845 | <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> --> |
---|
846 | </analyzer> |
---|
847 | </fieldType> |
---|
848 | |
---|
849 | <!-- Armenian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_hy.txt, then the Snowball "Armenian" stemmer --> |
---|
850 | <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100"> |
---|
851 | <analyzer> |
---|
852 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
853 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
854 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/> |
---|
855 | <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/> |
---|
856 | </analyzer> |
---|
857 | </fieldType> |
---|
858 | |
---|
859 | <!-- Indonesian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_id.txt, then IndonesianStemFilter with stemDerivational="true" --> |
---|
860 | <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100"> |
---|
861 | <analyzer> |
---|
862 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
863 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
864 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/> |
---|
865 | <!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false --> |
---|
866 | <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/> |
---|
867 | </analyzer> |
---|
868 | </fieldType> |
---|
869 | |
---|
870 | <!-- Italian: StandardTokenizer, ElisionFilter (lang/contractions_it.txt), LowerCaseFilter, stop words from lang/stopwords_it.txt (snowball format), then ItalianLightStemFilter --> |
---|
871 | <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100"> |
---|
872 | <analyzer> |
---|
873 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
874 | <!-- removes l', etc (the articles listed in lang/contractions_it.txt) --> |
---|
875 | <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/> |
---|
876 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
877 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/> |
---|
878 | <filter class="solr.ItalianLightStemFilterFactory"/> |
---|
879 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> --> |
---|
880 | </analyzer> |
---|
881 | </fieldType> |
---|
882 | |
---|
883 | <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming) |
---|
884 | |
---|
885 | NOTE: If you want to optimize search for precision, use default operator AND in your query |
---|
886 | parser config with <solrQueryParser defaultOperator="AND"/> further down in this file. Use |
---|
887 | OR if you would like to optimize for recall (default). |
---|
888 | --> |
---|
889 | <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false"> |
---|
890 | <analyzer> |
---|
891 | <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer) |
---|
892 | |
---|
893 | Kuromoji has a search mode (default) that does segmentation useful for search. A heuristic |
---|
894 | is used to segment compounds into their parts, and the compound itself is kept as a synonym. |
---|
895 | |
---|
896 | Valid values for attribute mode are: |
---|
897 | normal: regular segmentation |
---|
898 | search: segmentation useful for search, with compounds kept as synonyms (default) |
---|
899 | extended: same as search mode, but splits unknown words into unigrams (experimental) |
---|
900 | |
---|
901 | For some applications it might be good to use search mode for indexing and normal mode for |
---|
902 | queries to reduce recall and prevent parts of compounds from being matched and highlighted. |
---|
903 | Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query. |
---|
904 | |
---|
905 | Kuromoji also has a convenient user dictionary feature that allows overriding the statistical |
---|
906 | model with your own entries for segmentation, part-of-speech tags and readings without a need |
---|
907 | to specify weights. Notice that user dictionaries have not been subject to extensive testing. |
---|
908 | |
---|
909 | User dictionary attributes are: |
---|
910 | userDictionary: user dictionary filename |
---|
911 | userDictionaryEncoding: user dictionary encoding (default is UTF-8) |
---|
912 | |
---|
913 | See lang/userdict_ja.txt for a sample user dictionary file. |
---|
914 | |
---|
915 | See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. |
---|
916 | --> |
---|
917 | <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> |
---|
918 | <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>--> |
---|
919 | <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) --> |
---|
920 | <filter class="solr.JapaneseBaseFormFilterFactory"/> |
---|
921 | <!-- Removes tokens with certain part-of-speech tags --> |
---|
922 | <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/> |
---|
923 | <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) --> |
---|
924 | <filter class="solr.CJKWidthFilterFactory"/> |
---|
925 | <!-- Removes common tokens that are typically not useful for search and have a negative effect on ranking --> |
---|
926 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" /> |
---|
927 | <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) --> |
---|
928 | <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/> |
---|
929 | <!-- Lower-cases romaji characters --> |
---|
930 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
931 | </analyzer> |
---|
932 | </fieldType> |
---|
933 | |
---|
934 | <!-- Latvian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_lv.txt, then LatvianStemFilter --> |
---|
935 | <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100"> |
---|
936 | <analyzer> |
---|
937 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
938 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
939 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/> |
---|
940 | <filter class="solr.LatvianStemFilterFactory"/> |
---|
941 | </analyzer> |
---|
942 | </fieldType> |
---|
943 | |
---|
944 | <!-- Dutch: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_nl.txt (snowball format), StemmerOverrideFilter (lang/stemdict_nl.txt), then the Snowball "Dutch" stemmer --> |
---|
945 | <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100"> |
---|
946 | <analyzer> |
---|
947 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
948 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
949 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/> |
---|
950 | <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/> |
---|
951 | <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/> |
---|
952 | </analyzer> |
---|
953 | </fieldType> |
---|
954 | |
---|
955 | <!-- Norwegian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_no.txt (snowball format), then the Snowball "Norwegian" stemmer --> |
---|
956 | <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100"> |
---|
957 | <analyzer> |
---|
958 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
959 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
960 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/> |
---|
961 | <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/> |
---|
962 | <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> --> |
---|
963 | <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> --> |
---|
964 | </analyzer> |
---|
965 | </fieldType> |
---|
966 | |
---|
967 | <!-- Portuguese: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_pt.txt (snowball format), then PortugueseLightStemFilter; alternative stemmers are listed as comments below --> |
---|
968 | <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> |
---|
969 | <analyzer> |
---|
970 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
971 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
972 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/> |
---|
973 | <filter class="solr.PortugueseLightStemFilterFactory"/> |
---|
974 | <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> |
---|
975 | <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> |
---|
976 | <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> --> |
---|
977 | </analyzer> |
---|
978 | </fieldType> |
---|
979 | |
---|
980 | <!-- Romanian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_ro.txt, then the Snowball "Romanian" stemmer --> |
---|
981 | <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100"> |
---|
982 | <analyzer> |
---|
983 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
984 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
985 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/> |
---|
986 | <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/> |
---|
987 | </analyzer> |
---|
988 | </fieldType> |
---|
989 | |
---|
990 | <!-- Russian: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_ru.txt (snowball format), then the Snowball "Russian" stemmer --> |
---|
991 | <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100"> |
---|
992 | <analyzer> |
---|
993 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
994 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
995 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/> |
---|
996 | <filter class="solr.SnowballPorterFilterFactory" language="Russian"/> |
---|
997 | <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> --> |
---|
998 | </analyzer> |
---|
999 | </fieldType> |
---|
1000 | |
---|
1001 | <!-- Swedish: StandardTokenizer, LowerCaseFilter, stop words from lang/stopwords_sv.txt (snowball format), then the Snowball "Swedish" stemmer --> |
---|
1002 | <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100"> |
---|
1003 | <analyzer> |
---|
1004 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1005 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1006 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/> |
---|
1007 | <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/> |
---|
1008 | <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> --> |
---|
1009 | </analyzer> |
---|
1010 | </fieldType> |
---|
1011 | |
---|
1012 | <!-- Thai: StandardTokenizer, LowerCaseFilter, ThaiWordFilter, then stop words from lang/stopwords_th.txt (no stemmer is configured) --> |
---|
1013 | <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100"> |
---|
1014 | <analyzer> |
---|
1015 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1016 | <filter class="solr.LowerCaseFilterFactory"/> |
---|
1017 | <filter class="solr.ThaiWordFilterFactory"/> |
---|
1018 | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/> |
---|
1019 | </analyzer> |
---|
1020 | </fieldType> |
---|
1021 | |
---|
1022 | <!-- Turkish: StandardTokenizer, TurkishLowerCaseFilter, stop words from lang/stopwords_tr.txt (ignoreCase="false"), then the Snowball "Turkish" stemmer --> |
---|
1023 | <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100"> |
---|
1024 | <analyzer> |
---|
1025 | <tokenizer class="solr.StandardTokenizerFactory"/> |
---|
1026 | <filter class="solr.TurkishLowerCaseFilterFactory"/> |
---|
1027 | <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/> |
---|
1028 | <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/> |
---|
1029 | </analyzer> |
---|
1030 | </fieldType> |
---|
1031 | </types> |
---|
1032 | |
---|
1033 | |
---|
1034 | |
---|
1035 | </schema> |
---|