1
|
<!--
|
2
|
'$RCSfile$'
|
3
|
Copyright: 2000 Regents of the University of California and the
|
4
|
National Center for Ecological Analysis and Synthesis
|
5
|
For Details: http://knb.ecoinformatics.org/
|
6
|
|
7
|
'$Author: jones $'
|
8
|
'$Date: 2001-10-22 09:46:55 -0700 (Mon, 22 Oct 2001) $'
|
9
|
'$Revision: 853 $'
|
10
|
|
11
|
This program is free software; you can redistribute it and/or modify
|
12
|
it under the terms of the GNU General Public License as published by
|
13
|
the Free Software Foundation; either version 2 of the License, or
|
14
|
(at your option) any later version.
|
15
|
|
16
|
This program is distributed in the hope that it will be useful,
|
17
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
GNU General Public License for more details.
|
20
|
|
21
|
You should have received a copy of the GNU General Public License
|
22
|
along with this program; if not, write to the Free Software
|
23
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
24
|
-->
|
25
|
<!-- EML DTD document that defines the structural
|
26
|
characteristics of physical objects -->
|
27
|
<!-- The root element, which contains an identifier and
|
28
|
a physical description -->
|
29
|
<!ELEMENT eml-physical (identifier, format, characterEncoding?, size?, authentication*, compressionMethod?,
|
30
|
encodingMethod?, numHeaderLines?, recordDelimiter?, maxRecordLength?,
|
31
|
quoteCharacter*, literalCharacter*, (fieldStartColumn?, (fieldDelimiter | fieldWidth))* )>
|
32
|
<!-- File identifier of the metadata document -->
|
33
|
<!ELEMENT identifier (#PCDATA)>
|
34
|
<!ATTLIST identifier
|
35
|
system CDATA #IMPLIED
|
36
|
>
|
37
|
<!-- size -->
|
38
|
<!ELEMENT size (#PCDATA)>
|
39
|
<!ATTLIST size
|
40
|
unit CDATA "bytes"
|
41
|
>
|
42
|
<!-- Authentication value and method -->
|
43
|
<!ELEMENT authentication (#PCDATA)>
|
44
|
<!ATTLIST authentication
|
45
|
method CDATA #IMPLIED
|
46
|
>
|
47
|
<!-- Entity format (e.g., text, name of various binary formats [TIFF]) -->
|
48
|
<!ELEMENT format (#PCDATA)>
|
49
|
<!-- characterEncoding for text files (e.g., ASCII, UTF-8) -->
|
50
|
<!ELEMENT characterEncoding (#PCDATA)>
|
51
|
<!-- Method of compression -->
|
52
|
<!ELEMENT compressionMethod (#PCDATA)>
|
53
|
<!-- Method of encoding -->
|
54
|
<!ELEMENT encodingMethod (#PCDATA)>
|
55
|
<!-- The character used to delimit records in the entity -->
|
56
|
<!ELEMENT recordDelimiter (#PCDATA)>
|
57
|
<!-- NOTE(review): presumably the maximum length of a single record in the
     entity; the unit (characters vs. bytes) is not stated here. Confirm. -->
<!ELEMENT maxRecordLength (#PCDATA)>
|
58
|
<!-- The character used to quote data values so that the
|
59
|
field delimiters can be used in the data value, typically
|
60
|
" or ' -->
|
61
|
<!ELEMENT quoteCharacter (#PCDATA)>
|
62
|
<!-- The character used to escape special characters
|
63
|
so that they are interpreted literally, usually \ -->
|
64
|
<!ELEMENT literalCharacter (#PCDATA)>
|
65
|
<!-- Number of header lines or information that prepares data -->
|
66
|
<!ELEMENT numHeaderLines (#PCDATA)>
|
67
|
|
68
|
<!--
|
69
|
Variable width format fields (attributes) can vary in their
|
70
|
field length, thus the end of the field is
|
71
|
delimited by a special character called a
|
72
|
field delimiter (typically a comma or a space).
|
73
|
|
74
|
Data sets are generally classified as fixedWidth
|
75
|
format or variableWidth format, but we have
|
76
|
determined that this is actually a per-field
|
77
|
classification because one may encounter
|
78
|
fixedWidth fields mixed together in the same
|
79
|
data file with variableWidth fields.
|
80
|
|
81
|
In our encoding scheme, the start of each field
|
82
|
is assumed to be the column after the last column
|
83
|
of the previous field, or the first column
|
84
|
if this is the first field in the dataset, unless
|
85
|
the starting column is explicitly enumerated using the
|
86
|
"fieldStartColumn" element.
|
87
|
The end column for each field is classified
|
88
|
using either a special character delimiter indicated
|
89
|
using the "fieldDelimiter" element,
|
90
|
or a fixed field length indicated by using the "fieldWidth"
|
91
|
element. The delimiter for the last field in the data set can be omitted.
|
92
|
variableWidth fields can vary in their field length, and the end of
|
93
|
the field is delimited by a special character
|
94
|
called a field delimiter, usually a comma or
|
95
|
a tab character. fixedWidth fields have a set
|
96
|
length, and so the end of the field can always
|
97
|
be determined by adding the fieldWidth to the
|
98
|
starting column number. Here is an example:
|
99
|
|
100
|
Assume we have the following data in a data set:
|
101
|
|
102
|
May,100aaaa,1.2,
|
103
|
April,200aaaa,3.4,
|
104
|
June,300bbbb,4.6,
|
105
|
|
106
|
The metadata indicating the physical layout of the 4 fields would include the
|
107
|
following:
|
108
|
|
109
|
<fieldDelimiter>,</fieldDelimiter>
|
110
|
<fieldWidth>3</fieldWidth>
|
111
|
<fieldDelimiter>,</fieldDelimiter>
|
112
|
<fieldDelimiter>,</fieldDelimiter>
|
113
|
|
114
|
In a strictly fixed format file, the metadata would be slightly different:
|
115
|
|
116
|
May100aaaa1.2
|
117
|
Apr200aaaa3.4
|
118
|
Jun300bbbb4.6
|
119
|
|
120
|
<fieldWidth>3</fieldWidth>
|
121
|
<fieldWidth>3</fieldWidth>
|
122
|
<fieldWidth>4</fieldWidth>
|
123
|
<fieldWidth>3</fieldWidth>
|
124
|
|
125
|
or, one could explicitly describe the starting columns:
|
126
|
|
127
|
<fieldStartColumn>1</fieldStartColumn>
|
128
|
<fieldWidth>3</fieldWidth>
|
129
|
<fieldStartColumn>4</fieldStartColumn>
|
130
|
<fieldWidth>3</fieldWidth>
|
131
|
<fieldStartColumn>7</fieldStartColumn>
|
132
|
<fieldWidth>4</fieldWidth>
|
133
|
<fieldStartColumn>11</fieldStartColumn>
|
134
|
<fieldWidth>3</fieldWidth>
|
135
|
-->
|
136
|
<!ELEMENT fieldStartColumn (#PCDATA)>
|
137
|
<!ELEMENT fieldDelimiter (#PCDATA)>
|
138
|
<!ELEMENT fieldWidth (#PCDATA)>
|
139
|
<!-- End of file -->
|