1
|
<?xml version="1.0" ?>
|
2
|
<!--
|
3
|
THE OFFICIAL DataONE Index Solr Schema definition file.
|
4
|
This schema is copied into the dataone-cn-index buildout for deployment on cn nodes.
|
5
|
|
6
|
The Solr schema file. This file should be named "schema.xml" and
|
7
|
should be in the conf directory under the solr home
|
8
|
(i.e. ./solr/conf/schema.xml by default)
|
9
|
or located where the classloader for the Solr webapp can find it.
|
10
|
|
11
|
For more information on how to customize this file, please see...
|
12
|
http://wiki.apache.org/solr/SchemaXml
|
13
|
-->
|
14
|
|
15
|
<schema name="dataone" version="1.1">
|
16
|
<types>
|
17
|
<!-- field type definitions. The "name" attribute is
|
18
|
just a label to be used by field definitions. The "class"
|
19
|
attribute and any other attributes determine the real
|
20
|
behavior of the fieldtype. -->
|
21
|
|
22
|
<!-- "string" (solr.StrField): the value is not analyzed; it is indexed and
     stored verbatim. -->
<fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>

<!-- "boolean" (solr.BoolField): accepts "true" or "false". -->
<fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
27
|
|
28
|
<!-- The optional sortMissingLast and sortMissingFirst attributes are
|
29
|
currently supported on types that are sorted internally as strings.
|
30
|
- If sortMissingLast="true" then a sort on this field will cause documents
|
31
|
without the field to come after documents with the field,
|
32
|
regardless of the requested sort order (asc or desc).
|
33
|
- If sortMissingFirst="true" then a sort on this field will cause documents
|
34
|
without the field to come before documents with the field,
|
35
|
regardless of the requested sort order.
|
36
|
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
|
37
|
then default lucene sorting will be used which places docs without the field
|
38
|
first in an ascending sort and last in a descending sort.
|
39
|
-->
|
40
|
|
41
|
<!-- Plain numeric types. The text value is stored and indexed verbatim, so
     lexicographic order differs from numeric order and range queries are
     not supported on these types. -->
<fieldtype name="integer" class="solr.IntField"/>
<fieldtype name="long"    class="solr.LongField"/>
<fieldtype name="float"   class="solr.FloatField"/>
<fieldtype name="double"  class="solr.DoubleField"/>
|
48
|
|
49
|
<!-- Sortable numeric types. Each value is transformed into a string that is
     not human readable in its internal form, but whose lexicographic order
     matches the numeric order, so range queries work correctly. -->
<fieldtype name="sint"    class="solr.SortableIntField"    sortMissingLast="true"/>
<fieldtype name="slong"   class="solr.SortableLongField"   sortMissingLast="true"/>
<fieldtype name="sfloat"  class="solr.SortableFloatField"  sortMissingLast="true"/>
<fieldtype name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true"/>
|
57
|
|
58
|
<!-- Date type. Values take the form 1995-12-31T23:59:59Z, a more restricted
     form of the canonical dateTime representation described at
     http://www.w3.org/TR/xmlschema-2/#dateTime
       - The trailing "Z" designates UTC time and is mandatory.
       - Fractional seconds are optional: 1995-12-31T23:59:59.999Z
       - All other components are mandatory. -->
<fieldtype name="date" class="solr.DateField" sortMissingLast="true"/>
|
65
|
|
66
|
<!-- solr.TextField allows the specification of custom text analyzers
|
67
|
specified as a tokenizer and a list of token filters. Different
|
68
|
analyzers may be specified for indexing and querying.
|
69
|
|
70
|
The optional positionIncrementGap puts space between multiple fields of
|
71
|
this type on the same document, with the purpose of preventing false phrase
|
72
|
matching across fields.
|
73
|
|
74
|
For more info on customizing your analyzer chain, please see...
|
75
|
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
|
76
|
|
77
|
-->
|
78
|
|
79
|
<!-- Standard analyzer commonly used by Lucene developers -->
|
82
|
<!--
|
83
|
<fieldtype name="text_lu" class="solr.TextField" positionIncrementGap="100">
|
84
|
<analyzer>
|
85
|
<tokenizer class="solr.StandardTokenizerFactory"/>
|
86
|
<filter class="solr.StandardFilterFactory"/>
|
87
|
<filter class="solr.LowerCaseFilterFactory"/>
|
88
|
<filter class="solr.StopFilterFactory"/>
|
89
|
<filter class="solr.EnglishPorterFilterFactory"/>
|
90
|
</analyzer>
|
91
|
</fieldtype>
|
92
|
-->
|
93
|
<!-- One could also specify an existing Analyzer implementation in Java
|
94
|
via the class attribute on the analyzer element:
|
95
|
<fieldtype name="text_lu" class="solr.TextField">
|
96
|
<analyzer class="org.apache.lucene.analysis.snowball.SnowballAnalyzer"/>
|
97
|
</fieldtype>
|
98
|
-->
|
99
|
|
100
|
<!-- "text_ws": tokens are split on whitespace only, for more exact matching.
     The same analyzer is used for indexing and querying. -->
<fieldtype name="text_ws" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  </analyzer>
</fieldtype>
|
106
|
|
107
|
<!-- "text": general analyzed text. The index and query chains share the same
     tokenizer, word-delimiter, lower-case, stop-word and Porter-stem steps.
     They differ in two places only: catenateWords is 1 when indexing and 0
     when querying, and ReversedWildcardFilterFactory runs at index time only. -->
<fieldtype name="text" class="solr.TextField" positionIncrementGap="100">

  <!-- Index-time analysis -->
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            splitOnNumerics="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
    <!-- withOriginal="true": the unreversed token is kept as well. -->
    <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"/>
  </analyzer>

  <!-- Query-time analysis -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            splitOnNumerics="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

</fieldtype>
|
135
|
|
136
|
<!-- "text_no_token": whitespace tokenization followed by lower-casing and
     stop-word removal. Unlike "text", there is no word-delimiter splitting
     and no stemming. The index and query chains are identical. -->
<fieldtype name="text_no_token" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"/>
  </analyzer>

  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"/>
  </analyzer>

</fieldtype>
|
148
|
|
149
|
<!--
|
150
|
<fieldtype name="text_all" class="solr.TextField" positionIncrementGap="100">
|
151
|
<analyzer type="index">
|
152
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
153
|
in this example, we will only use synonyms at query time
|
154
|
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
155
|
|
156
|
<filter class="solr.StopFilterFactory" ignoreCase="true"/>
|
157
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
|
158
|
catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
159
|
<filter class="solr.LowerCaseFilterFactory"/>
|
160
|
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
161
|
</analyzer>
|
162
|
<analyzer type="query">
|
163
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
164
|
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
165
|
<filter class="solr.StopFilterFactory" ignoreCase="true"/>
|
166
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
|
167
|
catenateWords="0" catenateNumbers="0" catenateAll="0"/>
|
168
|
<filter class="solr.LowerCaseFilterFactory"/>
|
169
|
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
|
170
|
</analyzer>
|
171
|
</fieldtype>
|
172
|
-->
|
173
|
|
174
|
<!-- "textTight": less flexible matching than "text", but fewer false matches.
     Probably not ideal for product names, but may be good for SKUs and other
     identifiers: dashes can be inserted in the wrong place and the value
     still matches, because word and number parts are catenated
     (catenateWords/catenateNumbers = 1) rather than emitted separately
     (generateWordParts/generateNumberParts = 0).

     Stemming uses solr.SnowballPorterFilterFactory with language="English".
     This replaces the deprecated solr.EnglishPorterFilterFactory; it applies
     the same English stemmer and reads the same "protected" word list. -->
<fieldtype name="textTight" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="0" generateNumberParts="0"
            catenateWords="1" catenateNumbers="1" catenateAll="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
</fieldtype>
|
187
|
|
188
|
</types>
|
189
|
|
190
|
|
191
|
<!-- ======================================================================= -->
|
192
|
<fields>
|
193
|
<!-- Valid attributes for fields:
|
194
|
name: mandatory - the name for the field
|
195
|
type: mandatory - the name of a previously defined type from the <types> section
|
196
|
indexed: true if this field should be indexed (searchable)
|
197
|
stored: true if this field should be retrievable
|
198
|
multiValued: true if this field may contain multiple values per document
|
199
|
omitNorms: (expert) set to true to omit the norms associated with this field
|
200
|
(this disables length normalization and index-time boosting for the field)
|
201
|
-->
|
202
|
|
203
|
<!-- Identity and format -->
<field name="id" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="formatId" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="formatType" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="size" type="slong" indexed="true" stored="true" multiValued="false"/>

<!-- Checksum (stored only, not searchable) -->
<field name="checksum" type="string" indexed="false" stored="true" multiValued="false"/>
<field name="checksumAlgorithm" type="string" indexed="false" stored="true"/>

<!-- Dates and ownership -->
<field name="dateUploaded" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="dateModified" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="submitter" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="rightsHolder" type="string" indexed="true" stored="true" multiValued="false"/>

<!-- Replication details (stored only, not searchable) -->
<field name="authoritativeMN" type="string" indexed="false" stored="true" multiValued="false"/>
<field name="replicationAllowed" type="boolean" indexed="false" stored="true" multiValued="false"/>
<field name="numberReplicas" type="integer" indexed="false" stored="true" multiValued="false"/>
<field name="preferredReplicationMN" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="blockedReplicationMN" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="replicaMN" type="string" indexed="false" stored="true" multiValued="true"/>
<field name="replicaVerifiedDate" type="date" indexed="false" stored="true" multiValued="true"/>

<!-- Data source and obsolescence links -->
<field name="datasource" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="obsoletes" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="obsoletedBy" type="string" indexed="true" stored="true" multiValued="false"/>
|
223
|
|
224
|
<!-- Object relationships: multi-valued, verbatim string fields that are
     both searchable and retrievable. -->
<field name="resourceMap" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="documents" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="isDocumentedBy" type="string" indexed="true" stored="true" multiValued="true"/>
|
228
|
|
229
|
<!-- Permissions: one multi-valued string field per permission level, plus a
     single boolean "isPublic" flag. -->
<field name="readPermission" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="writePermission" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="changePermission" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="isPublic" type="boolean" indexed="true" stored="true"/>
|
234
|
|
235
|
<!-- Science metadata properties. Attributes are written in the same order as
     in the rest of this file (name, type, indexed, stored, multiValued);
     attribute order has no effect on the schema. -->

<!-- Descriptive text and people -->
<field name="abstract" type="text" indexed="true" stored="true" multiValued="false"/>
<field name="author" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="authorLastName" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="keywords" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="keyConcept" type="string" indexed="true" stored="true" multiValued="true"/>

<!-- Bounding coordinates and named places -->
<field name="southBoundCoord" type="sfloat" indexed="true" stored="true" multiValued="false"/>
<field name="northBoundCoord" type="sfloat" indexed="true" stored="true" multiValued="false"/>
<field name="westBoundCoord" type="sfloat" indexed="true" stored="true" multiValued="false"/>
<field name="eastBoundCoord" type="sfloat" indexed="true" stored="true" multiValued="false"/>
<field name="namedLocation" type="string" indexed="true" stored="true" multiValued="true"/>

<!-- Date range -->
<field name="beginDate" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="endDate" type="date" indexed="true" stored="true" multiValued="false"/>

<!-- Title, taxa, organizations and publication dates -->
<field name="title" type="text" indexed="true" stored="true" multiValued="false"/>
<field name="scientificName" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="relatedOrganizations" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="datePublished" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="pubDate" type="date" indexed="true" stored="true"/>
|
254
|
|
255
|
<!-- Naming pattern used below: a stored "string" field X is usually paired
     with an analyzed, non-stored "text" field named XText. -->
<field name="investigator" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="investigatorText" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="ogcUrl" type="text" indexed="false" stored="true"/>

<!-- Both "textTight" fields receive a copy of "id" through copyField. -->
<field name="sku" type="textTight" indexed="true" stored="true"/>
<field name="identifier" type="textTight" indexed="true" stored="true"/>

<field name="LTERSite" type="string" indexed="true" stored="true"/>
<field name="origin" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="originText" type="text" indexed="true" stored="false" multiValued="true"/>
<!-- Verbatim copy of "title"; indexed but not stored. -->
<field name="titlestr" type="string" indexed="true" stored="false"/>
<field name="geoform" type="string" indexed="true" stored="true"/>
<field name="presentationCat" type="string" indexed="true" stored="true"/>
<field name="purpose" type="text" indexed="true" stored="true"/>
<field name="updateDate" type="date" indexed="true" stored="true"/>
<field name="edition" type="text" indexed="true" stored="true"/>
<field name="dataUrl" type="string" indexed="false" stored="true"/>
<field name="originator" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="originatorText" type="text" indexed="true" stored="false" multiValued="true"/>

<!-- Taxonomic ranks -->
<field name="family" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="species" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="genus" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="kingdom" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="phylum" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="order" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="class" type="string" indexed="true" stored="true" multiValued="true"/>

<field name="webUrl" type="string" indexed="false" stored="true" multiValued="true"/>

<field name="contactOrganization" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="contactOrganizationText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="keywordsText" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="placeKey" type="text" indexed="true" stored="true" multiValued="true"/>

<field name="noBoundingBox" type="string" indexed="true" stored="true"/>
<field name="isSpatial" type="string" indexed="true" stored="true"/>

<field name="decade" type="string" indexed="true" stored="true"/>
<field name="gcmdKeyword" type="text" indexed="true" stored="true" multiValued="true"/>
|
293
|
|
294
|
<!-- ORNL DAAC fields. They may not be populated for NBII records, but they
     must be declared here for indexing purposes. Each stored "string" field
     is followed by its analyzed, non-stored "...Text" companion. -->
<field name="project" type="string" indexed="true" stored="true"/>
<field name="projectText" type="text" indexed="true" stored="false"/>

<field name="site" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="siteText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="parameter" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="parameterText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="sensor" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="sensorText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="source" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="sourceText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="term" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="termText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="topic" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="topicText" type="text" indexed="true" stored="false" multiValued="true"/>

<field name="fileID" type="string" indexed="true" stored="true"/>
<!-- "text" is the field named by defaultSearchField in this schema. -->
<field name="text" type="text" indexed="true" stored="true" multiValued="false"/>
|
318
|
<!-- ======================================================================= -->
|
319
|
<!-- Dynamic field definitions. When a field name is not declared explicitly,
     a dynamicField is used if the name matches one of these patterns.
     RESTRICTION: the glob-like pattern in the name attribute may contain a
     "*" only at the start or the end.
     EXAMPLE: name="*_i" matches any field ending in _i (such as myid_i, z_i).
     Longer patterns are matched first; if patterns of equal length both
     match, the one appearing first in the schema is used. -->
<dynamicField name="*_i"  type="sint"    indexed="true" stored="true"/>
<dynamicField name="*_s"  type="string"  indexed="true" stored="true"/>
<dynamicField name="*_l"  type="slong"   indexed="true" stored="true"/>
<dynamicField name="*_t"  type="text"    indexed="true" stored="true"/>
<dynamicField name="*_b"  type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f"  type="sfloat"  indexed="true" stored="true"/>
<dynamicField name="*_d"  type="sdouble" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date"    indexed="true" stored="true"/>
|
334
|
</fields>
|
335
|
|
336
|
<!-- ======================================================================= -->
|
337
|
<!-- Field used to determine and enforce document uniqueness. -->
<uniqueKey>id</uniqueKey>

<!-- Field the QueryParser searches when a query gives no explicit field name.
     NOTE(review): none of the copyField directives in this file targets
     "text"; presumably the indexing client fills it directly. TODO confirm. -->
<defaultSearchField>text</defaultSearchField>

<!-- SolrQueryParser configuration: defaultOperator is "AND" or "OR". -->
<solrQueryParser defaultOperator="AND"/>
|
345
|
|
346
|
<!-- ======================================================================= -->
|
347
|
<!-- copyField directives copy one field to another at the time a document is
     added to the index. They are used either to index the same field in
     different ways, or to gather several fields into one field for
     easier/faster searching. -->

<!-- The document id is also indexed through the two "textTight" fields. -->
<copyField source="id" dest="sku"/>
<copyField source="id" dest="identifier"/>
|
353
|
<!-- NOTE(review): this element is spelled "copyfield" (lowercase f), unlike
     every other copyField directive in this file. XML element names are
     case-sensitive, so Solr may not recognise it as a copy directive at all.
     Also note that replicaMN is declared multiValued="true" while datasource
     is multiValued="false". TODO confirm the intent before correcting the
     case. -->
<copyfield source="replicaMN" dest="datasource" />
|
354
|
|
355
|
<!-- Copy "title" (type "text") into "titlestr", a "string" field that is indexed but not stored. -->
<copyField source="title" dest="titlestr"/>
|
356
|
|
357
|
<!-- Feed the analyzed, non-stored "...Text" companions of "origin" and
     "originator". "originatorText" takes its content from "originator", so
     that it follows the same X to XText pairing as every other companion
     field in this schema (it was previously fed from "origin", which left
     "originator" without any searchable text copy). -->
<copyField source="origin" dest="originText"/>
<copyField source="originator" dest="originatorText"/>
|
359
|
|
360
|
<!-- Stored "string" fields copied into their analyzed "...Text" companions. -->
<copyField source="project" dest="projectText"/>
<copyField source="site" dest="siteText"/>
<copyField source="parameter" dest="parameterText"/>
<copyField source="sensor" dest="sensorText"/>
<copyField source="source" dest="sourceText"/>
<copyField source="term" dest="termText"/>
<copyField source="topic" dest="topicText"/>
<copyField source="investigator" dest="investigatorText"/>
<copyField source="keywords" dest="keywordsText"/>
<copyField source="contactOrganization" dest="contactOrganizationText"/>

<!-- Date fields copied into a second date field. -->
<copyField source="pubDate" dest="datePublished"/>
<copyField source="dateUploaded" dest="updateDate"/>
|
373
|
|
374
|
|
375
|
<!-- Similarity is the scoring routine for each document vs a query.
|
376
|
A custom similarity may be specified here, but the default is fine
|
377
|
for most applications. -->
|
378
|
<!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
|
379
|
|
380
|
</schema>
|