1
|
.. raw:: latex
|
2
|
|
3
|
\newpage
|
4
|
|
5
|
|
6
|
Metacat Indexing
|
7
|
===========================
|
8
|
Metacat v2.1 introduces support for building a SOLR index of Metacat content.
|
9
|
While we continue to support the "pathquery" search mechanism, it will be phased out
|
10
|
in favor of the more efficient SOLR query interface.
|
11
|
|
12
|
|
13
|
Metacat deployments that opt to use the Metacat SOLR index will be able to take advantage
|
14
|
of:
|
15
|
|
16
|
* fast search performance
|
17
|
* built-in paging features
|
18
|
* customizable return formats (for advanced admins)
|
19
|
|
20
|
Indexed documents and fields
|
21
|
-----------------------------
|
22
|
Metacat integrates the existing DataONE index library which includes many common metadata formats
|
23
|
out-of-the-box:
|
24
|
|
25
|
1. EML
|
26
|
2. FGDC
|
27
|
3. Dryad*
|
28
|
|
29
|
|
30
|
Default indexed fields
|
31
|
-----------------------
|
32
|
For a complete listing of the indexed fields, please see the DataONE documentation.
|
33
|
|
34
|
http://mule1.dataone.org/ArchitectureDocs-current/design/SearchMetadata.html
|
35
|
|
36
|
Metacat also reports on the currently-indexed fields; simply navigate to:
|
37
|
|
38
|
http://mule1.dataone.org/ArchitectureDocs-current/apis/MN_APIs.html#MNQuery.getQueryEngineDescription
|
39
|
|
40
|
with "solr" as the engine.
|
41
|
|
42
|
Index configuration
|
43
|
----------------------------
|
44
|
Metacat-index is deployed as a separate web application (metacat-index.war) and should be deployed
|
45
|
as a sibling of the Metacat webapp (metacat.war). Deploying metacat-index.war is only required when SOLR support
|
46
|
is desired (e.g., for MetacatUI) and can safely be omitted if it will not be utilized for any given Metacat deployment.
|
47
|
|
48
|
|
49
|
During the initial installation/upgrade, an empty index will be initialized in the configured "solr-home" location.
|
50
|
Metacat-index will index all the existing Metacat content when the webapp next initializes.
|
51
|
Note: the configured solr-home directory should not exist before configuring Metacat with indexing for the first time;
|
52
|
otherwise the blank index will not be created for metacat-index to utilize.
|
53
|
|
54
|
Additional advanced configuration options are available in the metacat.properties file (shared between Metacat and Metacat-index).
|
55
|
|
56
|
|
57
|
Adding additional document types and fields
|
58
|
--------------------------------------------
|
59
|
TBD: Step-by-step guide for adding new documents and indexed fields.
|
60
|
|
61
|
|
62
|
Querying the index
|
63
|
--------------------
|
64
|
The SOLR index can be queried using standard SOLR syntax and return options.
|
65
|
The DataONE query interface exposes the SOLR query engine.
|
66
|
|
67
|
http://mule1.dataone.org/ArchitectureDocs-current/apis/MN_APIs.html#MNQuery.query
|
68
|
|
69
|
Please see the SOLR documentation for examples and exhaustive syntax information.
|
70
|
|
71
|
http://lucene.apache.org/solr/
|
72
|
|
73
|
|
74
|
Access Policy enforcement
|
75
|
-------------------------
|
76
|
Access control is enforced by the index such that only records that are readable by the
|
77
|
user performing the query are returned to the user. Any SOLR query submitted will be
|
78
|
augmented with access control criteria corresponding to whether and how the user is currently
|
79
|
authenticated. Both certificate-based (DataONE API) and JSESSIONID-based (Metacat API)
|
80
|
authentication are simultaneously supported.
|
81
|
|
82
|
|
83
|
Regenerating the index from scratch
|
84
|
-----------------------------------
|
85
|
When the SOLR index has been drastically modified, a complete regeneration of the
|
86
|
index may be necessary. In order to accomplish this:
|
87
|
|
88
|
Step-by-step instructions:
|
89
|
|
90
|
1. Entirely remove the solr-home directory
|
91
|
2. Step through the Metacat admin interface main properties screen, specifying the solr-home directory you wish to use
|
92
|
3. Restart the webapp container (Tomcat).
|
93
|
|
94
|
Content can also be submitted for index regeneration by using the Metacat API:
|
95
|
|
96
|
1. Login as the Metacat administrator
|
97
|
2. Navigate to: <host>/<metacat_context>/metacat?action=reindex[&pid={pid}]
|
98
|
3. If the pid parameter is omitted, all objects in Metacat will be submitted for reindexing.
|
99
|
|
100
|
|
101
|
|
102
|
Class design overview
|
103
|
----------------------
|
104
|
|
105
|
.. figure:: images/indexing-class-diagram.png
|
106
|
|
107
|
Figure 1. Class design overview.
|
108
|
|
109
|
..
|
110
|
@startuml images/indexing-class-diagram.png
|
111
|
|
112
|
package "Current cn-index-processor (library)" {
|
113
|
|
114
|
interface IDocumentSubprocessor {
|
115
|
+ boolean canProcess(Document doc)
|
116
|
+ initExpression(XPath xpath)
|
117
|
+ Map<String, SolrDoc> processDocument(String identifier, Map<String, SolrDoc> docs, Document doc)
|
118
|
}
|
119
|
class AbstractDocumentSubprocessor {
|
120
|
- List<SolrField> fields
|
121
|
+ setMatchDocument(String matchDocument)
|
122
|
+ setFieldList(List<SolrField> fieldList)
|
123
|
}
|
124
|
class ResourceMapSubprocessor {
|
125
|
}
|
126
|
class ScienceMetadataDocumentSubprocessor {
|
127
|
}
|
128
|
|
129
|
interface ISolrField {
|
130
|
+ initExpression(XPath xpathObject)
|
131
|
+ List<SolrElementField> getFields(Document doc, String identifier)
|
132
|
}
|
133
|
class SolrField {
|
134
|
- String name
|
135
|
- String xpath
|
136
|
- boolean multivalue
|
137
|
}
|
138
|
class CommonRootSolrField {
|
139
|
}
|
140
|
class RootElement {
|
141
|
}
|
142
|
class LeafElement {
|
143
|
}
|
144
|
class FullTextSolrField {
|
145
|
}
|
146
|
class MergeSolrField {
|
147
|
}
|
148
|
class ResolveSolrField {
|
149
|
}
|
150
|
class SolrFieldResourceMap {
|
151
|
}
|
152
|
|
153
|
class SolrDoc {
|
154
|
- List<SolrElementField> fieldList
|
155
|
}
|
156
|
|
157
|
class SolrElementField {
|
158
|
- String name
|
159
|
- String value
|
160
|
}
|
161
|
|
162
|
}
|
163
|
|
164
|
IDocumentSubprocessor <|-- AbstractDocumentSubprocessor
|
165
|
AbstractDocumentSubprocessor <|-- ResourceMapSubprocessor
|
166
|
AbstractDocumentSubprocessor <|-- ScienceMetadataDocumentSubprocessor
|
167
|
|
168
|
ISolrField <|-- SolrField
|
169
|
SolrField <|-- CommonRootSolrField
|
170
|
CommonRootSolrField o--"1" RootElement
|
171
|
RootElement o--"*" LeafElement
|
172
|
SolrField <|-- FullTextSolrField
|
173
|
SolrField <|-- MergeSolrField
|
174
|
SolrField <|-- ResolveSolrField
|
175
|
SolrField <|-- SolrFieldResourceMap
|
176
|
|
177
|
AbstractDocumentSubprocessor o--"*" ISolrField
|
178
|
|
179
|
IDocumentSubprocessor --> SolrDoc
|
180
|
|
181
|
SolrDoc o--"*" SolrElementField
|
182
|
|
183
|
package "SOLR (library)" {
|
184
|
|
185
|
abstract class SolrServer {
|
186
|
+ add(SolrInputDocument doc)
|
187
|
+ deleteByQuery(String id)
|
188
|
+ query(SolrQuery query)
|
189
|
}
|
190
|
class EmbeddedSolrServer {
|
191
|
}
|
192
|
class HttpSolrServer {
|
193
|
}
|
194
|
|
195
|
}
|
196
|
|
197
|
SolrServer <|-- EmbeddedSolrServer
|
198
|
SolrServer <|-- HttpSolrServer
|
199
|
|
200
|
package "Metacat-index (webapp)" {
|
201
|
|
202
|
class ApplicationController {
|
203
|
- List<SolrIndex> solrIndex
|
204
|
+ regenerateIndex()
|
205
|
}
|
206
|
|
207
|
class SolrIndex {
|
208
|
- List<IDocumentSubprocessor> subprocessors
|
209
|
- SolrServer solrServer
|
210
|
+ insert(String pid, InputStream data)
|
211
|
+ update(String pid, InputStream data)
|
212
|
+ remove(String pid)
|
213
|
}
|
214
|
|
215
|
class SystemMetadataEventListener {
|
216
|
- SolrIndex solrIndex
|
217
|
+ itemAdded(ItemEvent<SystemMetadata>)
|
218
|
+ itemRemoved(ItemEvent<SystemMetadata>)
|
219
|
}
|
220
|
|
221
|
}
|
222
|
|
223
|
package "Metacat (webapp)" {
|
224
|
|
225
|
class MetacatSolrIndex {
|
226
|
- SolrServer solrServer
|
227
|
+ InputStream query(SolrQuery)
|
228
|
}
|
229
|
|
230
|
class HazelcastService {
|
231
|
- IMap hzIndexQueue
|
232
|
- IMap hzSystemMetadata
|
233
|
- IMap hzObjectPath
|
234
|
}
|
235
|
|
236
|
}
|
237
|
|
238
|
MetacatSolrIndex o--"1" SolrServer
|
239
|
HazelcastService .. SystemMetadataEventListener
|
240
|
|
241
|
ApplicationController o--"*" SolrIndex
|
242
|
SolrIndex o--"1" SolrServer
|
243
|
SolrIndex "1"--o SystemMetadataEventListener
|
244
|
SolrIndex o--"*" IDocumentSubprocessor: Assembled using Spring bean configuration
|
245
|
|
246
|
|
247
|
|
248
|
|
249
|
@enduml
|