<?xml version="1.0" ?>
2
<!-- 
3
THE OFFICIAL DataONE Index Solr Schema definition file.  
4
This schema is copied into the dataone-cn-index buildout for deployment on cn nodes.
5

    
6
The Solr schema file. This file should be named "schema.xml" and
7
 should be in the conf directory under the solr home
8
 (i.e. ./solr/conf/schema.xml by default) 
9
 or located where the classloader for the Solr webapp can find it.
10

    
11
 For more information, on how to customize this file, please see...
12
 http://wiki.apache.org/solr/SchemaXml
13
-->
14

    
15
<schema name="dataone" version="1.1">
16
    <types>
17
        <!-- field type definitions. The "name" attribute is
18
   just a label to be used by field definitions.  The "class"
19
   attribute and any other attributes determine the real
20
   behavior of the fieldtype.  -->
21

    
22
        <!-- "string": solr.StrField values are not analyzed; they are indexed and
             stored verbatim. -->
        <fieldtype name="string"
                   class="solr.StrField"
                   sortMissingLast="true"/>
24

    
25
        <!-- "boolean": accepts the values "true" or "false". -->
        <fieldtype name="boolean"
                   class="solr.BoolField"
                   sortMissingLast="true"/>
27

    
28
        <!-- The optional sortMissingLast and sortMissingFirst attributes are
29
             currently supported on types that are sorted internally as strings.
30
           - If sortMissingLast="true" then a sort on this field will cause documents
31
           without the field to come after documents with the field,
32
           regardless of the requested sort order (asc or desc).
33
           - If sortMissingFirst="true" then a sort on this field will cause documents
34
           without the field to come before documents with the field,
35
           regardless of the requested sort order.
36
           - If sortMissingLast="false" and sortMissingFirst="false" (the default),
37
           then default lucene sorting will be used which places docs without the field
38
           first in an ascending sort and last in a descending sort.
39
        -->
40

    
41
        <!-- Plain numeric types. The text value is stored and indexed verbatim, so
             range queries are not supported: lexicographic ordering of the text is
             not the same as numeric ordering. -->
        <fieldtype name="integer" class="solr.IntField"/>
        <fieldtype name="long"    class="solr.LongField"/>
        <fieldtype name="float"   class="solr.FloatField"/>
        <fieldtype name="double"  class="solr.DoubleField"/>
48

    
49
        <!-- Sortable numeric types. The value is turned into a string that isn't
             human readable in its internal form, but whose lexicographic ordering
             equals the numeric ordering, so range queries work correctly. -->
        <fieldtype name="sint"    class="solr.SortableIntField"    sortMissingLast="true"/>
        <fieldtype name="slong"   class="solr.SortableLongField"   sortMissingLast="true"/>
        <fieldtype name="sfloat"  class="solr.SortableFloatField"  sortMissingLast="true"/>
        <fieldtype name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true"/>
57

    
58
        <!-- "date": values look like 1995-12-31T23:59:59Z, a more restricted form of
             the canonical dateTime representation described at
             http://www.w3.org/TR/xmlschema-2/#dateTime
             - The trailing "Z" (UTC) is mandatory.
             - Fractional seconds are optional: 1995-12-31T23:59:59.999Z
             - Every other component is mandatory. -->
        <fieldtype name="date"
                   class="solr.DateField"
                   sortMissingLast="true"/>
65

    
66
        <!-- solr.TextField allows the specification of custom text analyzers
67
            specified as a tokenizer and a list of token filters. Different
68
            analyzers may be specified for indexing and querying.
69

    
70
            The optional positionIncrementGap puts space between multiple fields of
71
            this type on the same document, with the purpose of preventing false phrase
72
            matching across fields.
73

    
74
            For more info on customizing your analyzer chain, please see...
75
         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
76

    
77
        -->
78

    
79
        <!-- Standard analyzer commonly used by Lucene developers -->
82
<!--
83
         <fieldtype name="text_lu" class="solr.TextField" positionIncrementGap="100">
84
            <analyzer>
85
                <tokenizer class="solr.StandardTokenizerFactory"/>
86
                <filter class="solr.StandardFilterFactory"/>
87
                <filter class="solr.LowerCaseFilterFactory"/>
88
                <filter class="solr.StopFilterFactory"/>
89
                <filter class="solr.EnglishPorterFilterFactory"/>
90
            </analyzer>
91
        </fieldtype> 
92
-->
93
        <!-- One could also specify an existing Analyzer implementation in Java
94
             via the class attribute on the analyzer element:
95
        <fieldtype name="text_lu" class="solr.TextField">
96
          <analyzer class="org.apache.lucene.analysis.snowball.SnowballAnalyzer"/>
97
        </fieldtype>
98
        -->
99

    
100
        <!-- "text_ws": tokens are produced by splitting on whitespace only, which
             gives more exact matching. No further filters are applied. -->
        <fieldtype name="text_ws"
                   class="solr.TextField"
                   positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
            </analyzer>
        </fieldtype>
106

    
107
        <!-- "text": general-purpose analyzed text.
             Index chain: whitespace tokenizer, WordDelimiterFilterFactory
             (generateWordParts=1, catenateWords=1, generateNumberParts=1,
             splitOnNumerics=0), lower-casing, stop-word filter, Porter stemmer and
             finally ReversedWildcardFilterFactory with withOriginal="true".
             Query chain: the same tokenizer and filters, except that
             catenateWords is 0 and ReversedWildcardFilterFactory is not applied. -->
        <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
            <analyzer type="index">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
                        generateWordParts="1"
                        catenateWords="1"
                        generateNumberParts="1"
                        splitOnNumerics="0"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.StopFilterFactory"/>
                <filter class="solr.PorterStemFilterFactory"/>
                <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
                        generateWordParts="1"
                        catenateWords="0"
                        generateNumberParts="1"
                        splitOnNumerics="0"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.StopFilterFactory"/>
                <filter class="solr.PorterStemFilterFactory"/>
            </analyzer>
        </fieldtype>
135

    
136
        <!-- "text_no_token": whitespace tokenizer followed by lower-casing and the
             stop-word filter. The index and query chains are identical; there is
             no word-delimiter splitting and no stemming. -->
        <fieldtype name="text_no_token"
                   class="solr.TextField"
                   positionIncrementGap="100">
            <analyzer type="index">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.StopFilterFactory"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.StopFilterFactory"/>
            </analyzer>
        </fieldtype>
148

    
149
<!--
150
         <fieldtype name="text_all" class="solr.TextField" positionIncrementGap="100">
151
            <analyzer type="index">
152
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
153
                in this example, we will only use synonyms at query time
154
                <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
155
               
156
                <filter class="solr.StopFilterFactory" ignoreCase="true"/>
157
                <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
158
                        catenateWords="1" catenateNumbers="1" catenateAll="0"/>
159
                <filter class="solr.LowerCaseFilterFactory"/>
160
                <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
161
            </analyzer>
162
            <analyzer type="query">
163
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
164
                <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
165
                <filter class="solr.StopFilterFactory" ignoreCase="true"/>
166
                <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
167
                        catenateWords="0" catenateNumbers="0" catenateAll="0"/>
168
                <filter class="solr.LowerCaseFilterFactory"/>
169
                <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
170
            </analyzer>
171
        </fieldtype>
172
 -->
173

    
174
        <!-- "textTight": less flexible matching, but fewer false matches. Probably
             not ideal for product names but may be good for SKUs. Can insert dashes
             in the wrong place and still match.
             A single analyzer serves both indexing and querying: whitespace
             tokenizer, synonyms from synonyms.txt (expand="false"), stop words,
             WordDelimiterFilterFactory that only catenates word and number runs
             (no individual parts generated), lower-casing, and the English Porter
             stemmer with protwords.txt as its protected-word list. -->
        <fieldtype name="textTight" class="solr.TextField" positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.SynonymFilterFactory"
                        synonyms="synonyms.txt"
                        ignoreCase="true"
                        expand="false"/>
                <filter class="solr.StopFilterFactory" ignoreCase="true"/>
                <filter class="solr.WordDelimiterFilterFactory"
                        generateWordParts="0"
                        generateNumberParts="0"
                        catenateWords="1"
                        catenateNumbers="1"
                        catenateAll="0"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
            </analyzer>
        </fieldtype>
187
		
188
    </types>
189

    
190

    
191
<!-- ======================================================================= -->
192
    <fields>
193
        <!-- Valid attributes for fields:
194
            name: mandatory - the name for the field
195
            type: mandatory - the name of a previously defined type from the <types> section
196
            indexed: true if this field should be indexed (searchable)
197
            stored: true if this field should be retrievable
198
            multiValued: true if this field may contain multiple values per document
199
            omitNorms: (expert) set to true to omit the norms associated with this field
200
                       (this disables length normalization and index-time boosting for the field)
201
        -->
202

    
203
        <!-- Identity, format, checksum, date, ownership and replication fields.
             Attribute order is name / type / indexed / stored / multiValued. -->
        <field name="id"                     type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="formatId"               type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="formatType"             type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="size"                   type="slong"   indexed="true"  stored="true" multiValued="false"/>
        <field name="checksum"               type="string"  indexed="false" stored="true" multiValued="false"/>
        <field name="checksumAlgorithm"      type="string"  indexed="false" stored="true"/>
        <field name="dateUploaded"           type="date"    indexed="true"  stored="true" multiValued="false"/>
        <field name="dateModified"           type="date"    indexed="true"  stored="true" multiValued="false"/>
        <field name="submitter"              type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="rightsHolder"           type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="authoritativeMN"        type="string"  indexed="false" stored="true" multiValued="false"/>
        <field name="replicationAllowed"     type="boolean" indexed="false" stored="true" multiValued="false"/>
        <field name="numberReplicas"         type="integer" indexed="false" stored="true" multiValued="false"/>
        <field name="preferredReplicationMN" type="string"  indexed="false" stored="true" multiValued="true"/>
        <field name="blockedReplicationMN"   type="string"  indexed="false" stored="true" multiValued="true"/>
        <field name="replicaMN"              type="string"  indexed="false" stored="true" multiValued="true"/>
        <field name="replicaVerifiedDate"    type="date"    indexed="false" stored="true" multiValued="true"/>
        <field name="datasource"             type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="obsoletes"              type="string"  indexed="true"  stored="true" multiValued="false"/>
        <field name="obsoletedBy"            type="string"  indexed="true"  stored="true" multiValued="false"/>
223

    
224
        <!-- Object relationships: each field is a multi-valued, indexed and stored string. -->
        <field name="resourceMap"    type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="documents"      type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="isDocumentedBy" type="string" indexed="true" stored="true" multiValued="true"/>
228

    
229
        <!-- Permissions: three multi-valued string fields plus the isPublic flag. -->
        <field name="readPermission"   type="string"  indexed="true" stored="true" multiValued="true"/>
        <field name="writePermission"  type="string"  indexed="true" stored="true" multiValued="true"/>
        <field name="changePermission" type="string"  indexed="true" stored="true" multiValued="true"/>
        <field name="isPublic"         type="boolean" indexed="true" stored="true"/>
234
        
235
        <!-- Science metadata properties.
             Attribute order is name / type / indexed / stored / multiValued. -->
        <field name="abstract"             type="text"   indexed="true" stored="true" multiValued="false"/>
        <field name="author"               type="string" indexed="true" stored="true" multiValued="false"/>
        <field name="authorLastName"       type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="keywords"             type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="keyConcept"           type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="southBoundCoord"      type="sfloat" indexed="true" stored="true" multiValued="false"/>
        <field name="northBoundCoord"      type="sfloat" indexed="true" stored="true" multiValued="false"/>
        <field name="westBoundCoord"       type="sfloat" indexed="true" stored="true" multiValued="false"/>
        <field name="eastBoundCoord"       type="sfloat" indexed="true" stored="true" multiValued="false"/>
        <field name="namedLocation"        type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="beginDate"            type="date"   indexed="true" stored="true" multiValued="false"/>
        <field name="endDate"              type="date"   indexed="true" stored="true" multiValued="false"/>

        <field name="title"                type="text"   indexed="true" stored="true" multiValued="false"/>
        <field name="scientificName"       type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="relatedOrganizations" type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="datePublished"        type="date"   indexed="true" stored="true" multiValued="false"/>
        <field name="pubDate"              type="date"   indexed="true" stored="true"/>

        <!-- investigatorText is an indexed-only (not stored) "text" companion of investigator. -->
        <field name="investigator"         type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="investigatorText"     type="text"   indexed="true" stored="false" multiValued="true"/>
257
        <!-- Further descriptive fields. Names ending in "Text" are indexed-only
             (stored="false") "text" fields paired with a stored "string" field. -->
        <field name="ogcUrl"          type="text"      indexed="false" stored="true"/>
        <field name="sku"             type="textTight" indexed="true"  stored="true"/>
        <field name="identifier"      type="textTight" indexed="true"  stored="true"/>
        <field name="LTERSite"        type="string"    indexed="true"  stored="true"/>
        <field name="origin"          type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="originText"      type="text"      indexed="true"  stored="false" multiValued="true"/>
        <field name="titlestr"        type="string"    indexed="true"  stored="false"/>
        <field name="geoform"         type="string"    indexed="true"  stored="true"/>
        <field name="presentationCat" type="string"    indexed="true"  stored="true"/>
        <field name="purpose"         type="text"      indexed="true"  stored="true"/>
        <field name="updateDate"      type="date"      indexed="true"  stored="true"/>
        <field name="edition"         type="text"      indexed="true"  stored="true"/>
        <field name="dataUrl"         type="string"    indexed="false" stored="true"/>
        <field name="originator"      type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="originatorText"  type="text"      indexed="true"  stored="false" multiValued="true"/>

        <!-- Taxonomic ranks: multi-valued strings. -->
        <field name="family"          type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="species"         type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="genus"           type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="kingdom"         type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="phylum"          type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="order"           type="string"    indexed="true"  stored="true"  multiValued="true"/>
        <field name="class"           type="string"    indexed="true"  stored="true"  multiValued="true"/>
279

    
280
        <!-- URL, contact, keyword, spatial-flag and decade fields. -->
        <field name="webUrl"                  type="string" indexed="false" stored="true"  multiValued="true"/>

        <field name="contactOrganization"     type="string" indexed="true"  stored="true"  multiValued="true"/>
        <field name="contactOrganizationText" type="text"   indexed="true"  stored="false" multiValued="true"/>

        <field name="keywordsText"            type="text"   indexed="true"  stored="false" multiValued="true"/>
        <field name="placeKey"                type="text"   indexed="true"  stored="true"  multiValued="true"/>

        <field name="noBoundingBox"           type="string" indexed="true"  stored="true"/>
        <field name="isSpatial"               type="string" indexed="true"  stored="true"/>

        <field name="decade"                  type="string" indexed="true"  stored="true"/>
        <field name="gcmdKeyword"             type="text"   indexed="true"  stored="true"  multiValued="true"/>
293

    
294
        <!-- ORNL DAAC fields. They may not be populated for NBII, but must be
             declared here for indexing purposes. Each stored "string" field is
             followed by its indexed-only "text" companion. -->
        <field name="project"       type="string" indexed="true" stored="true"/>
        <field name="projectText"   type="text"   indexed="true" stored="false"/>

        <field name="site"          type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="siteText"      type="text"   indexed="true" stored="false" multiValued="true"/>

        <field name="parameter"     type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="parameterText" type="text"   indexed="true" stored="false" multiValued="true"/>

        <field name="sensor"        type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="sensorText"    type="text"   indexed="true" stored="false" multiValued="true"/>

        <field name="source"        type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="sourceText"    type="text"   indexed="true" stored="false" multiValued="true"/>

        <field name="term"          type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="termText"      type="text"   indexed="true" stored="false" multiValued="true"/>

        <field name="topic"         type="string" indexed="true" stored="true"  multiValued="true"/>
        <field name="topicText"     type="text"   indexed="true" stored="false" multiValued="true"/>

        <field name="fileID"        type="string" indexed="true" stored="true"/>
        <field name="text"          type="text"   indexed="true" stored="true"  multiValued="false"/>
318
<!-- ======================================================================= -->
319
        <!-- Dynamic field definitions. When a field name is not declared above, the
             dynamicField whose pattern matches the name is used instead.
             RESTRICTION: the glob-like pattern in the name attribute may contain
             a "*" only at the start or the end.
             EXAMPLE: name="*_i" matches any field ending in _i (myid_i, z_i).
             Longer patterns are tried first; when two patterns of equal length
             both match, the one appearing first in the schema wins. -->
        <dynamicField name="*_i"  type="sint"    indexed="true" stored="true"/>
        <dynamicField name="*_s"  type="string"  indexed="true" stored="true"/>
        <dynamicField name="*_l"  type="slong"   indexed="true" stored="true"/>
        <dynamicField name="*_t"  type="text"    indexed="true" stored="true"/>
        <dynamicField name="*_b"  type="boolean" indexed="true" stored="true"/>
        <dynamicField name="*_f"  type="sfloat"  indexed="true" stored="true"/>
        <dynamicField name="*_d"  type="sdouble" indexed="true" stored="true"/>
        <dynamicField name="*_dt" type="date"    indexed="true" stored="true"/>
334
    </fields>
335

    
336
<!-- ======================================================================= -->
337
    <!-- Name of the field used to determine and enforce document uniqueness. -->
    <uniqueKey>id</uniqueKey>
339

    
340
    <!-- Field the QueryParser searches when a query gives no explicit field name.
         NOTE(review): no copyField in this schema targets "text", so it is
         presumably populated directly by the indexing client; confirm. -->
    <defaultSearchField>text</defaultSearchField>
342

    
343
    <!-- SolrQueryParser configuration; defaultOperator is either "AND" or "OR". -->
    <solrQueryParser defaultOperator="AND"/>
345

    
346
<!-- ======================================================================= -->
347
    <!-- copyField commands copy one field to another at the time a document is
         added to the index. They are used either to index the same field in
         different ways, or to gather several fields into one field for
         easier/faster searching. -->

    <!-- The document id is also analysed with the "textTight" type. -->
    <copyField source="id" dest="sku"/>
    <copyField source="id" dest="identifier"/>

    <!-- NOTE(review): the element below is spelled "copyfield" (lower-case f),
         unlike every other rule in this section. XML names are case-sensitive,
         so it is presumably not picked up as a copyField rule; confirm.
         It is deliberately left unchanged: "replicaMN" is multiValued while
         "datasource" is not, so just correcting the spelling could make
         documents with several replicas fail to index. -->
    <copyfield source="replicaMN" dest="datasource" />

    <!-- Un-analysed string copy of the title. -->
    <copyField source="title" dest="titlestr"/>

    <!-- Each stored "string" field feeds its own indexed-only "text" companion.
         originatorText was previously fed from "origin" (a duplicate of the
         originText rule), which left "originator" without any analysed copy. -->
    <copyField source="origin" dest="originText"/>
    <copyField source="originator" dest="originatorText"/>

    <copyField source="project" dest="projectText"/>

    <copyField source="site" dest="siteText"/>
    <copyField source="parameter" dest="parameterText"/>
    <copyField source="sensor" dest="sensorText"/>
    <copyField source="source" dest="sourceText"/>
    <copyField source="term" dest="termText"/>
    <copyField source="topic" dest="topicText"/>
    <copyField source="investigator" dest="investigatorText"/>
    <copyField source="keywords" dest="keywordsText"/>
    <copyField source="pubDate" dest="datePublished"/>
    <copyField source="dateUploaded" dest="updateDate"/>
    <copyField source="contactOrganization" dest="contactOrganizationText"/>
373
  
374

    
375
    <!-- Similarity is the scoring routine for each document vs a query.
376
A custom similarity may be specified here, but the default is fine
377
for most applications.  -->
378
    <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
379

    
380
</schema>