Coverage report: /home/ellis/comp/core/lib/dat/parquet/thrift.lisp
Kind | Covered | All | % |
expression | 0 | 77 | 0.0 |
branch | 0 | 0 | nil |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; /home/ellis/comp/core/lisp/lib/dat/parquet/thrift.lisp --- Parquet Thrift Definitions -*- buffer-read-only:t -*-
3
;; input = /home/ellis/comp/core/.stash/parquet.json
5
;; This file was generated automatically by
6
;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS
11
(in-package :dat/parquet)
13
(defvar *parquet-json-types*
14
'(:boolean :int32 :int64 :int96 :float :double :byte-array
15
:fixed-len-byte-array))
16
(defvar *parquet-json-converted-types*
17
'(:utf8 :map :map-key-value :list :enum :decimal :date :time-millis
18
:time-micros :timestamp-millis :timestamp-micros :uint-8 :uint-16 :uint-32
19
:uint-64 :int-8 :int-16 :int-32 :int-64 :json :bson :interval))
20
(defvar *parquet-json-field-repetition-types* '(:required :optional :repeated))
21
(defvar *parquet-json-encodings*
22
'(:plain :plain-dictionary :rle :bit-packed :delta-binary-packed
23
:delta-length-byte-array :delta-byte-array :rle-dictionary
25
(defvar *parquet-json-compression-codecs*
26
'(:uncompressed :snappy :gzip :lzo :brotli :lz4 :zstd :lz4-raw))
27
(defvar *parquet-json-page-types*
28
'(:data-page :index-page :dictionary-page :data-page-v2))
29
(defvar *parquet-json-boundary-orders* '(:unordered :ascending :descending))
30
(deftype parquet-boolean () 'boolean)
31
(deftype parquet-int32 () '(signed-byte 32))
32
(deftype parquet-int64 () '(signed-byte 64))
33
(deftype parquet-int96 () '(signed-byte 96))
34
(deftype parquet-float () 'float)
35
(deftype parquet-double () 'double-float)
36
(deftype parquet-byte-array (&optional dat/parquet/gen::size)
37
`(octet-vector ,dat/parquet/gen::size))
38
(deftype parquet-fixed-len-byte-array (dat/parquet/gen::size)
39
`(octet-vector ,dat/parquet/gen::size))
40
(defclass parquet-size-statistics (parquet-object)
41
((unencoded-byte-array-data-bytes :documentation
42
"The number of physical bytes stored for BYTE_ARRAY data values assuming
43
no encoding. This is exclusive of the bytes needed to store the length of
44
each byte array. In other words, this field is equivalent to the `(size
45
of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
46
written)`. To determine unencoded sizes of other types readers can use
47
schema information multiplied by the number of non-null and null values.
48
The number of null\\non-null values can be inferred from the histograms
51
For example, if a column chunk is dictionary-encoded with dictionary
52
[\\a\\, \\bc\\, \\cde\\], and a data page contains the indices [0, 0, 1, 2],
53
then this value for that data page should be 7 (1 + 1 + 2 + 3).
55
This field should only be set for types that use BYTE_ARRAY as their
58
:initarg :unencoded-byte-array-data-bytes :initform nil :type
59
(or null (signed-byte 64)))
60
(repetition-level-histogram :documentation
61
"When present, there is expected to be one element corresponding to each
62
repetition (i.e. size=max repetition_level+1) where each element
63
represents the number of times the repetition level was observed in the
66
This field may be omitted if max_repetition_level is 0 without loss
70
:initarg :repetition-level-histogram :initform nil :type
71
(or null (vector (signed-byte 64))))
72
(definition-level-histogram :documentation
73
"Same as repetition_level_histogram except for definition levels.
75
This field may be omitted if max_definition_level is 0 or 1 without
79
:initarg :definition-level-histogram :initform nil :type
80
(or null (vector (signed-byte 64)))))
82
"A structure for capturing metadata for estimating the unencoded,
83
uncompressed size of data written. This is useful for readers to estimate
84
how much memory is needed to reconstruct data in their memory model and for
85
fine grained filter pushdown on nested structures (the histograms contained
86
in this structure can help determine the number of nulls at a particular
87
nesting level and maximum length of lists).
89
(defclass parquet-statistics (parquet-object)
91
"DEPRECATED: min and max value of the column. Use min_value and max_value.
93
Values are encoded using PLAIN encoding, except that variable-length byte
94
arrays do not include a length prefix.
96
These fields encode min and max values determined by signed comparison
97
only. New files should use the correct order for a column's logical type
98
and store the values in the min_value and max_value fields.
100
To support older readers, these may be set when the column order is
103
:initarg :max :initform nil :type (or null octet-vector))
104
(min :initarg :min :initform nil :type (or null octet-vector))
105
(null-count :documentation "count of null value in the column
107
:initarg :null-count :initform nil :type
108
(or null (signed-byte 64)))
109
(distinct-count :documentation "count of distinct values occurring
111
:initarg :distinct-count :initform nil :type
112
(or null (signed-byte 64)))
113
(max-value :documentation
114
"Lower and upper bound values for the column, determined by its ColumnOrder.
116
These may be the actual minimum and maximum values found on a page or column
117
chunk, but can also be (more compact) values that do not exist on a page or
118
column chunk. For example, instead of storing \\Blart Versenwald III\\, a writer
119
may set min_value=\\B\\, max_value=\\C\\. Such more compact values must still be
120
valid values within the column's logical type.
122
Values are encoded using PLAIN encoding, except that variable-length byte
123
arrays do not include a length prefix.
125
:initarg :max-value :initform nil :type (or null octet-vector))
126
(min-value :initarg :min-value :initform nil :type
127
(or null octet-vector))
128
(is-max-value-exact :documentation
129
"If true, max_value is the actual maximum value for a column
131
:initarg :is-max-value-exact :initform nil :type (or null boolean))
132
(is-min-value-exact :documentation
133
"If true, min_value is the actual minimum value for a column
135
:initarg :is-min-value-exact :initform nil :type
137
(:documentation "Statistics per row group and per page
138
All fields are optional.
140
(defclass parquet-string-type (parquet-object) nil
141
(:documentation "Empty structs to use as logical type annotations
143
(defclass parquet-uuid-type (parquet-object) nil)
144
(defclass parquet-map-type (parquet-object) nil)
145
(defclass parquet-list-type (parquet-object) nil)
146
(defclass parquet-enum-type (parquet-object) nil)
147
(defclass parquet-date-type (parquet-object) nil)
148
(defclass parquet-float16-type (parquet-object) nil)
149
(defclass parquet-null-type (parquet-object) nil
151
"Logical type to annotate a column that is always null.
153
Sometimes when discovering the schema of existing data, values are always
154
null and the physical type can't be determined. This annotation signals
155
the case where the physical type was guessed from all null values.
157
(defclass parquet-decimal-type (parquet-object)
158
((scale :initarg :scale :type (signed-byte 32))
159
(precision :initarg :precision :type (signed-byte 32)))
160
(:documentation "Decimal logical type annotation
162
Scale must be zero or a positive integer less than or equal to the precision.
163
Precision must be a non-zero positive integer.
165
To maintain forward-compatibility in v1, implementations using this logical
166
type must also set scale and precision on the annotated SchemaElement.
168
Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
170
(defclass parquet-milli-seconds (parquet-object) nil
171
(:documentation "Time units for logical types
173
(defclass parquet-micro-seconds (parquet-object) nil)
174
(defclass parquet-nano-seconds (parquet-object) nil)
175
(defclass parquet-time-unit (parquet-object)
176
((millis :initarg :millis :initform nil :type
177
(or null parquet-milli-seconds))
178
(micros :initarg :micros :initform nil :type
179
(or null parquet-micro-seconds))
180
(nanos :initarg :nanos :initform nil :type
181
(or null parquet-nano-seconds))))
182
(defclass parquet-timestamp-type (parquet-object)
183
((isadjustedtoutc :initarg :isadjustedtoutc :type boolean)
184
(unit :initarg :unit :type parquet-time-unit))
185
(:documentation "Timestamp logical type annotation
187
Allowed for physical types: INT64
189
(defclass parquet-time-type (parquet-object)
190
((isadjustedtoutc :initarg :isadjustedtoutc :type boolean)
191
(unit :initarg :unit :type parquet-time-unit))
192
(:documentation "Time logical type annotation
194
Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
196
(defclass parquet-int-type (parquet-object)
197
((bitwidth :initarg :bitwidth)
198
(issigned :initarg :issigned :type boolean))
199
(:documentation "Integer logical type annotation
201
bitWidth must be 8, 16, 32, or 64.
203
Allowed for physical types: INT32, INT64
205
(defclass parquet-json-type (parquet-object) nil
206
(:documentation "Embedded JSON logical type annotation
208
Allowed for physical types: BYTE_ARRAY
210
(defclass parquet-bson-type (parquet-object) nil
211
(:documentation "Embedded BSON logical type annotation
213
Allowed for physical types: BYTE_ARRAY
215
(defclass parquet-logical-type (parquet-object)
216
((string :initarg :string :initform nil :type
217
(or null parquet-string-type))
218
(map :initarg :map :initform nil :type (or null parquet-map-type))
219
(list :initarg :list :initform nil :type
220
(or null parquet-list-type))
221
(enum :initarg :enum :initform nil :type
222
(or null parquet-enum-type))
223
(decimal :initarg :decimal :initform nil :type
224
(or null parquet-decimal-type))
225
(date :initarg :date :initform nil :type
226
(or null parquet-date-type))
233
(or null parquet-time-type))
234
(timestamp :initarg :timestamp :initform nil :type
235
(or null parquet-timestamp-type))
236
(integer :initarg :integer :initform nil :type
237
(or null parquet-int-type))
238
(unknown :initarg :unknown :initform nil :type
239
(or null parquet-null-type))
240
(json :initarg :json :initform nil :type
241
(or null parquet-json-type))
242
(bson :initarg :bson :initform nil :type
243
(or null parquet-bson-type))
244
(uuid :initarg :uuid :initform nil :type
245
(or null parquet-uuid-type))
246
(float16 :initarg :float16 :initform nil :type
247
(or null parquet-float16-type)))
248
(:documentation "LogicalType annotations to replace ConvertedType.
250
To maintain compatibility, implementations using LogicalType for a
251
SchemaElement must also set the corresponding ConvertedType (if any)
252
from the following table.
254
(defclass parquet-schema-element (parquet-object)
255
((type :documentation
256
"Data type for this field. Not set if the current element is a non-leaf node
258
:initarg :type :initform nil :type (or null parquet-type))
259
(type-length :documentation
260
"If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
261
Otherwise, if specified, this is the maximum bit length to store any of the values.
262
(e.g. a low cardinality INT col could have this set to 3). Note that this is
263
in the schema, and therefore fixed for the entire file.
265
:initarg :type-length :initform nil :type
266
(or null (signed-byte 32)))
267
(repetition-type :documentation
268
"repetition of the field. The root of the schema does not have a repetition_type.
269
All other nodes must have one
271
:initarg :repetition-type :initform nil :type
272
(or null parquet-field-repetition-type))
273
(name :documentation "Name of the field in the schema
275
:initarg :name :type string)
276
(num-children :documentation
277
"Nested fields. Since thrift does not support nested fields,
278
the nesting is flattened to a single list by a depth-first traversal.
279
The children count is used to construct the nested relationship.
280
This field is not set when the element is a primitive type
282
:initarg :num-children :initform nil :type
283
(or null (signed-byte 32)))
284
(converted-type :documentation
285
"DEPRECATED: When the schema is the result of a conversion from another model.
286
Used to record the original type to help with cross conversion.
288
This is superseded by logicalType.
290
:initarg :converted-type :initform nil :type
291
(or null parquet-converted-type))
292
(scale :documentation
293
"DEPRECATED: Used when this column contains decimal data.
294
See the DECIMAL converted type for more details.
296
This is superseded by using the DecimalType annotation in logicalType.
298
:initarg :scale :initform nil :type (or null (signed-byte 32)))
299
(precision :initarg :precision :initform nil :type
300
(or null (signed-byte 32)))
301
(field-id :documentation
302
"When the original schema supports field ids, this will save the
303
original field id in the parquet schema
305
:initarg :field-id :initform nil :type (or null (signed-byte 32)))
306
(logicaltype :documentation "The logical type of this SchemaElement
308
LogicalType replaces ConvertedType, but ConvertedType is still required
309
for some logical types to ensure forward-compatibility in format v1.
311
:initarg :logicaltype :initform nil :type
312
(or null parquet-logical-type)))
313
(:documentation "Represents a element inside a schema definition.
314
- if it is a group (inner node) then type is undefined and num_children is defined
315
- if it is a primitive type (leaf) then type is defined and num_children is undefined
316
the nodes are listed in depth first traversal order.
318
(defclass parquet-data-page-header (parquet-object)
319
((num-values :documentation
320
"Number of values, including NULLs, in this data page.
322
If a OffsetIndex is present, a page must begin at a row
323
boundary (repetition_level = 0). Otherwise, pages may begin
324
within a row (repetition_level > 0).
327
:initarg :num-values :type (signed-byte 32))
328
(encoding :documentation "Encoding used for this data page *
330
:initarg :encoding :type parquet-encoding)
331
(definition-level-encoding :documentation
332
"Encoding used for definition levels *
334
:initarg :definition-level-encoding :type parquet-encoding)
335
(repetition-level-encoding :documentation
336
"Encoding used for repetition levels *
338
:initarg :repetition-level-encoding :type parquet-encoding)
339
(statistics :documentation
340
"Optional statistics for the data in this page *
342
:initarg :statistics :initform nil :type
343
(or null parquet-statistics)))
344
(:documentation "Data page header
346
(defclass parquet-index-page-header (parquet-object) nil)
347
(defclass parquet-dictionary-page-header (parquet-object)
348
((num-values :documentation "Number of values in the dictionary *
350
:initarg :num-values :type (signed-byte 32))
351
(encoding :documentation "Encoding using this dictionary page *
353
:initarg :encoding :type parquet-encoding)
354
(is-sorted :documentation
355
"If true, the entries in the dictionary are sorted in ascending order *
357
:initarg :is-sorted :initform nil :type (or null boolean)))
359
"The dictionary page must be placed at the first position of the column chunk
360
if it is partly or completely dictionary encoded. At most one dictionary page
361
can be placed in a column chunk.
364
(defclass parquet-data-page-header-v2 (parquet-object)
365
((num-values :documentation
366
"Number of values, including NULLs, in this data page. *
368
:initarg :num-values :type (signed-byte 32))
369
(num-nulls :documentation "Number of NULL values, in this data page.
370
Number of non-null = num_values - num_nulls which is also the number of values in the data section *
372
:initarg :num-nulls :type (signed-byte 32))
373
(num-rows :documentation
374
"Number of rows in this data page. Every page must begin at a
375
row boundary (repetition_level = 0): rows must **not** be
376
split across page boundaries when using V2 data pages.
379
:initarg :num-rows :type (signed-byte 32))
380
(encoding :documentation "Encoding used for data in this page *
382
:initarg :encoding :type parquet-encoding)
383
(definition-levels-byte-length :documentation
384
"Length of the definition levels
386
:initarg :definition-levels-byte-length :type (signed-byte 32))
387
(repetition-levels-byte-length :documentation
388
"Length of the repetition levels
390
:initarg :repetition-levels-byte-length :type (signed-byte 32))
391
(is-compressed :documentation "Whether the values are compressed.
392
Which means the section of the page between
393
definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
394
is compressed with the compression_codec.
395
If missing it is considered compressed
397
:initarg :is-compressed :initform nil :type (or null boolean))
398
(statistics :documentation
399
"Optional statistics for the data in this page *
401
:initarg :statistics :initform nil :type
402
(or null parquet-statistics)))
404
"New page format allowing reading levels without decompressing the data
405
Repetition and definition levels are uncompressed
406
The remaining section containing the data is compressed if is_compressed is true
409
(defclass parquet-split-block-algorithm (parquet-object) nil
410
(:documentation "Block-based algorithm type annotation. *
412
(defclass parquet-bloom-filter-algorithm (parquet-object)
413
((block :documentation
414
"Block-based Bloom filter. *
421
(or null parquet-split-block-algorithm)))
422
(:documentation "The algorithm used in Bloom filter. *
424
(defclass parquet-xx-hash (parquet-object) nil
426
"Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash
427
algorithm. It uses 64 bits version of xxHash.
430
(defclass parquet-bloom-filter-hash (parquet-object)
431
((xxhash :documentation "xxHash Strategy. *
433
:initarg :xxhash :initform nil :type (or null parquet-xx-hash)))
435
"The hash function used in Bloom filter. This function takes the hash of a column value
436
using plain encoding.
439
(defclass parquet-uncompressed (parquet-object) nil
440
(:documentation "The compression used in the Bloom filter.
443
(defclass parquet-bloom-filter-compression (parquet-object)
444
((uncompressed :initarg :uncompressed :initform nil :type
445
(or null parquet-uncompressed))))
446
(defclass parquet-bloom-filter-header (parquet-object)
447
((numbytes :documentation "The size of bitset in bytes *
449
:initarg :numbytes :type (signed-byte 32))
450
(algorithm :documentation "The algorithm for setting bits. *
452
:initarg :algorithm :type parquet-bloom-filter-algorithm)
453
(hash :documentation "The hash function used for Bloom filter. *
455
:initarg :hash :type parquet-bloom-filter-hash)
456
(compression :documentation
457
"The compression used in the Bloom filter *
459
:initarg :compression :type parquet-bloom-filter-compression))
461
"Bloom filter header is stored at beginning of Bloom filter data of each column
462
and followed by its bitset.
465
(defclass parquet-page-header (parquet-object)
466
((type :documentation
467
"the type of the page: indicates which of the *_header fields is set *
469
:initarg :type :type parquet-page-type)
470
(uncompressed-page-size :documentation
471
"Uncompressed page size in bytes (not including this header) *
473
:initarg :uncompressed-page-size :type (signed-byte 32))
474
(compressed-page-size :documentation
475
"Compressed (and potentially encrypted) page size in bytes, not including this header *
477
:initarg :compressed-page-size :type (signed-byte 32))
479
"The 32-bit CRC checksum for the page, to be be calculated as follows:
481
- The standard CRC32 algorithm is used (with polynomial 0x04C11DB7,
482
the same as in e.g. GZip).
483
- All page types can have a CRC (v1 and v2 data pages, dictionary pages,
485
- The CRC is computed on the serialization binary representation of the page
486
(as written to disk), excluding the page header. For example, for v1
487
data pages, the CRC is computed on the concatenation of repetition levels,
488
definition levels and column values (optionally compressed, optionally
490
- The CRC computation therefore takes place after any compression
491
and encryption steps, if any.
493
If enabled, this allows for disabling checksumming in HDFS if only a few
494
pages need to be read.
496
:initarg :crc :initform nil :type (or null (signed-byte 32)))
497
(data-page-header :initarg :data-page-header :initform nil :type
498
(or null parquet-data-page-header))
499
(index-page-header :initarg :index-page-header :initform nil :type
500
(or null parquet-index-page-header))
501
(dictionary-page-header :initarg :dictionary-page-header :initform
502
nil :type (or null parquet-dictionary-page-header))
503
(data-page-header-v2 :initarg :data-page-header-v2 :initform nil
504
:type (or null parquet-data-page-header-v2))))
505
(defclass parquet-key-value (parquet-object)
506
((key :initarg :key :type string)
507
(value :initarg :value :initform nil :type (or null string)))
508
(:documentation "Wrapper struct to store key values
510
(defclass parquet-sorting-column (parquet-object)
511
((column-idx :documentation
512
"The ordinal position of the column (in this row group) *
514
:initarg :column-idx :type (signed-byte 32))
515
(descending :documentation
516
"If true, indicates this column is sorted in descending order. *
518
:initarg :descending :type boolean)
519
(nulls-first :documentation
520
"If true, nulls will come before non-null values, otherwise,
523
:initarg :nulls-first :type boolean))
524
(:documentation "Sort order within a RowGroup of a leaf column
526
(defclass parquet-page-encoding-stats (parquet-object)
527
((page-type :documentation "the page type (data\\dic\\...) *
529
:initarg :page-type :type parquet-page-type)
530
(encoding :documentation "encoding of the page *
532
:initarg :encoding :type parquet-encoding)
533
(count :documentation
534
"number of pages of this type with this encoding *
536
:initarg :count :type (signed-byte 32)))
537
(:documentation "statistics of a given page type and encoding
539
(defclass parquet-column-meta-data (parquet-object)
540
((type :documentation "Type of this column *
542
:initarg :type :type parquet-type)
543
(encodings :documentation
544
"Set of all encodings used for this column. The purpose is to validate
545
whether we can decode those pages. *
547
:initarg :encodings :type (vector parquet-encoding))
548
(path-in-schema :documentation "Path in schema *
550
:initarg :path-in-schema :type (vector string))
551
(codec :documentation "Compression codec *
553
:initarg :codec :type parquet-compression-codec)
554
(num-values :documentation "Number of values in this column *
556
:initarg :num-values :type (signed-byte 64))
557
(total-uncompressed-size :documentation
558
"total byte size of all uncompressed pages in this column chunk (including the headers) *
560
:initarg :total-uncompressed-size :type (signed-byte 64))
561
(total-compressed-size :documentation
562
"total byte size of all compressed, and potentially encrypted, pages
563
in this column chunk (including the headers) *
565
:initarg :total-compressed-size :type (signed-byte 64))
566
(key-value-metadata :documentation "Optional key\\value metadata *
568
:initarg :key-value-metadata :initform nil :type
569
(or null (vector parquet-key-value)))
570
(data-page-offset :documentation
571
"Byte offset from beginning of file to first data page *
573
:initarg :data-page-offset :type (signed-byte 64))
574
(index-page-offset :documentation
575
"Byte offset from beginning of file to root index page *
577
:initarg :index-page-offset :initform nil :type
578
(or null (signed-byte 64)))
579
(dictionary-page-offset :documentation
580
"Byte offset from the beginning of file to first (only) dictionary page *
582
:initarg :dictionary-page-offset :initform nil :type
583
(or null (signed-byte 64)))
584
(statistics :documentation "optional statistics for this column chunk
586
:initarg :statistics :initform nil :type
587
(or null parquet-statistics))
588
(encoding-stats :documentation
589
"Set of all encodings used for pages in this column chunk.
590
This information can be used to determine if all data pages are
591
dictionary encoded for example *
593
:initarg :encoding-stats :initform nil :type
594
(or null (vector parquet-page-encoding-stats)))
595
(bloom-filter-offset :documentation
596
"Byte offset from beginning of file to Bloom filter data. *
598
:initarg :bloom-filter-offset :initform nil :type
599
(or null (signed-byte 64)))
600
(bloom-filter-length :documentation
601
"Size of Bloom filter data including the serialized header, in bytes.
602
Added in 2.10 so readers may not read this field from old files and
603
it can be obtained after the BloomFilterHeader has been deserialized.
604
Writers should write this field so readers can read the bloom filter
607
:initarg :bloom-filter-length :initform nil :type
608
(or null (signed-byte 32)))
609
(size-statistics :documentation
610
"Optional statistics to help estimate total memory when converted to in-memory
611
representations. The histograms contained in these statistics can
612
also be useful in some cases for more fine-grained nullability\\list length
615
:initarg :size-statistics :initform nil :type
616
(or null parquet-size-statistics)))
617
(:documentation "Description for column metadata
619
(defclass parquet-encryption-with-footer-key (parquet-object) nil)
620
(defclass parquet-encryption-with-column-key (parquet-object)
621
((path-in-schema :documentation "Column path in schema *
623
:initarg :path-in-schema :type (vector string))
624
(key-metadata :documentation
625
"Retrieval metadata of column encryption key *
627
:initarg :key-metadata :initform nil :type (or null octet-vector))))
628
(defclass parquet-column-crypto-meta-data (parquet-object)
629
((encryption-with-footer-key :initarg :encryption-with-footer-key
630
:initform nil :type (or null parquet-encryption-with-footer-key))
631
(encryption-with-column-key :initarg :encryption-with-column-key
632
:initform nil :type (or null parquet-encryption-with-column-key))))
633
(defclass parquet-column-chunk (parquet-object)
634
((file-path :documentation
635
"File where column data is stored. If not set, assumed to be same file as
636
metadata. This path is relative to the current file.
639
:initarg :file-path :initform nil :type (or null string))
640
(file-offset :documentation
641
"Deprecated: Byte offset in file_path to the ColumnMetaData
643
Past use of this field has been inconsistent, with some implementations
644
using it to point to the ColumnMetaData and some using it to point to
645
the first page in the column chunk. In many cases, the ColumnMetaData at this
646
location is wrong. This field is now deprecated and should not be used.
647
Writers should set this field to 0 if no ColumnMetaData has been written outside
650
:initarg :file-offset :type (signed-byte 64))
651
(meta-data :documentation
652
"Column metadata for this chunk. Some writers may also replicate this at the
653
location pointed to by file_path\\file_offset.
654
Note: while marked as optional, this field is in fact required by most major
655
Parquet implementations. As such, writers MUST populate this field.
658
:initarg :meta-data :initform nil :type
659
(or null parquet-column-meta-data))
660
(offset-index-offset :documentation
661
"File offset of ColumnChunk's OffsetIndex *
663
:initarg :offset-index-offset :initform nil :type
664
(or null (signed-byte 64)))
665
(offset-index-length :documentation
666
"Size of ColumnChunk's OffsetIndex, in bytes *
668
:initarg :offset-index-length :initform nil :type
669
(or null (signed-byte 32)))
670
(column-index-offset :documentation
671
"File offset of ColumnChunk's ColumnIndex *
673
:initarg :column-index-offset :initform nil :type
674
(or null (signed-byte 64)))
675
(column-index-length :documentation
676
"Size of ColumnChunk's ColumnIndex, in bytes *
678
:initarg :column-index-length :initform nil :type
679
(or null (signed-byte 32)))
680
(crypto-metadata :documentation
681
"Crypto metadata of encrypted columns *
683
:initarg :crypto-metadata :initform nil :type
684
(or null parquet-column-crypto-meta-data))
685
(encrypted-column-metadata :documentation
686
"Encrypted column metadata for this chunk *
688
:initarg :encrypted-column-metadata :initform nil :type
689
(or null octet-vector))))
690
(defclass parquet-row-group (parquet-object)
691
((columns :documentation
692
"Metadata for each column chunk in this row group.
693
This list must have the same order as the SchemaElement list in FileMetaData.
696
:initarg :columns :type (vector parquet-column-chunk))
697
(total-byte-size :documentation
698
"Total byte size of all the uncompressed column data in this row group *
700
:initarg :total-byte-size :type (signed-byte 64))
701
(num-rows :documentation "Number of rows in this row group *
703
:initarg :num-rows :type (signed-byte 64))
704
(sorting-columns :documentation
705
"If set, specifies a sort ordering of the rows in this RowGroup.
706
The sorting columns can be a subset of all the columns.
708
:initarg :sorting-columns :initform nil :type
709
(or null (vector parquet-sorting-column)))
710
(file-offset :documentation
711
"Byte offset from beginning of file to first page (data or dictionary)
714
:initarg :file-offset :initform nil :type
715
(or null (signed-byte 64)))
716
(total-compressed-size :documentation
717
"Total byte size of all compressed (and potentially encrypted) column data
720
:initarg :total-compressed-size :initform nil :type
721
(or null (signed-byte 64)))
722
(ordinal :documentation "Row group ordinal in the file *
724
:initarg :ordinal :initform nil :type (or null (signed-byte 16)))))
725
(defclass parquet-type-defined-order (parquet-object) nil
727
"Empty struct to signal the order defined by the physical or logical type
729
(defclass parquet-column-order (parquet-object)
730
((type-order :documentation "The sort orders for logical types are:
731
UTF8 - unsigned byte-wise comparison
732
INT8 - signed comparison
733
INT16 - signed comparison
734
INT32 - signed comparison
735
INT64 - signed comparison
736
UINT8 - unsigned comparison
737
UINT16 - unsigned comparison
738
UINT32 - unsigned comparison
739
UINT64 - unsigned comparison
740
DECIMAL - signed comparison of the represented value
741
DATE - signed comparison
742
TIME_MILLIS - signed comparison
743
TIME_MICROS - signed comparison
744
TIMESTAMP_MILLIS - signed comparison
745
TIMESTAMP_MICROS - signed comparison
747
JSON - unsigned byte-wise comparison
748
BSON - unsigned byte-wise comparison
749
ENUM - unsigned byte-wise comparison
753
In the absence of logical types, the sort order is determined by the physical type:
754
BOOLEAN - false, true
755
INT32 - signed comparison
756
INT64 - signed comparison
757
INT96 (only used for legacy timestamps) - undefined
758
FLOAT - signed comparison of the represented value (*)
759
DOUBLE - signed comparison of the represented value (*)
760
BYTE_ARRAY - unsigned byte-wise comparison
761
FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
763
(*) Because the sorting order is not specified properly for floating
764
point values (relations vs. total ordering) the following
765
compatibility rules should be applied when reading statistics:
766
- If the min is a NaN, it should be ignored.
767
- If the max is a NaN, it should be ignored.
768
- If the min is +0, the row group may contain -0 values as well.
769
- If the max is -0, the row group may contain +0 values as well.
770
- When looking for NaN values, min and max should be ignored.
772
When writing statistics the following rules should be followed:
773
- NaNs should not be written to min or max statistics fields.
774
- If the computed max value is zero (whether negative or positive),
775
`+0.0` should be written into the max statistics field.
776
- If the computed min value is zero (whether negative or positive),
777
`-0.0` should be written into the min statistics field.
779
:initarg :type-order :initform nil :type
780
(or null parquet-type-defined-order)))
782
"Union to specify the order used for the min_value and max_value fields for a
783
column. This union takes the role of an enhanced enum that allows rich
784
elements (which will be needed for a collation-based ordering in the future).
787
* TypeDefinedOrder - the column uses the order defined by its logical or
788
physical type (if there is no logical type).
790
If the reader does not support the value of this union, min and max stats
791
for this column should be ignored.
793
(defclass parquet-page-location (parquet-object)
794
((offset :documentation "Offset of the page in the file *
796
:initarg :offset :type (signed-byte 64))
797
(compressed-page-size :documentation
798
"Size of the page, including header. Sum of compressed_page_size and header
801
:initarg :compressed-page-size :type (signed-byte 32))
802
(first-row-index :documentation
803
"Index within the RowGroup of the first row of the page. When an
804
OffsetIndex is present, pages must begin on row boundaries
805
(repetition_level = 0).
807
:initarg :first-row-index :type (signed-byte 64))))
808
(defclass parquet-offset-index (parquet-object)
809
((page-locations :documentation
810
"PageLocations, ordered by increasing PageLocation.offset. It is required
811
that page_locations[i].first_row_index < page_locations[i+1].first_row_index.
813
:initarg :page-locations :type (vector parquet-page-location))
814
(unencoded-byte-array-data-bytes :documentation
815
"Unencoded\\uncompressed size for BYTE_ARRAY types.
817
See documention for unencoded_byte_array_data_bytes in SizeStatistics for
818
more details on this field.
820
:initarg :unencoded-byte-array-data-bytes :initform nil :type
821
(or null (vector (signed-byte 64)))))
822
(:documentation "Optional offsets for each data page in a ColumnChunk.
824
Forms part of the page index, along with ColumnIndex.
826
OffsetIndex may be present even if ColumnIndex is not.
828
(defclass parquet-column-index (parquet-object)
829
((null-pages :documentation
830
"A list of Boolean values to determine the validity of the corresponding
831
min and max values. If true, a page contains only null values, and writers
832
have to set the corresponding entries in min_values and max_values to
833
byte[0], so that all lists have the same length. If false, the
834
corresponding entries in min_values and max_values must be valid.
836
:initarg :null-pages :type (vector boolean))
837
(min-values :documentation
838
"Two lists containing lower and upper bounds for the values of each page
839
determined by the ColumnOrder of the column. These may be the actual
840
minimum and maximum values found on a page, but can also be (more compact)
841
values that do not exist on a page. For example, instead of storing \\\\Blart
842
Versenwald III\\, a writer may set min_values[i]=\\B\\, max_values[i]=\\C\\.
843
Such more compact values must still be valid values within the column's
844
logical type. Readers must make sure that list entries are populated before
845
using them by inspecting null_pages.
847
:initarg :min-values :type (vector octet-vector))
848
(max-values :initarg :max-values :type (vector octet-vector))
849
(boundary-order :documentation
850
"Stores whether both min_values and max_values are ordered and if so, in
851
which direction. This allows readers to perform binary searches in both
852
lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
853
if the lists are ordered.
855
:initarg :boundary-order :type parquet-boundary-order)
856
(null-counts :documentation
857
"A list containing the number of null values for each page *
859
:initarg :null-counts :initform nil :type
860
(or null (vector (signed-byte 64))))
861
(repetition-level-histograms :documentation
862
"Contains repetition level histograms for each page
863
concatenated together. The repetition_level_histogram field on
864
SizeStatistics contains more details.
866
When present the length should always be (number of pages *
867
(max_repetition_level + 1)) elements.
869
Element 0 is the first element of the histogram for the first page.
870
Element (max_repetition_level + 1) is the first element of the histogram
874
:initarg :repetition-level-histograms :initform nil :type
875
(or null (vector (signed-byte 64))))
876
(definition-level-histograms :documentation
877
"Same as repetition_level_histograms except for definition levels.
880
:initarg :definition-level-histograms :initform nil :type
881
(or null (vector (signed-byte 64)))))
883
"Optional statistics for each data page in a ColumnChunk.
885
Forms part of the page index, along with OffsetIndex.
887
If this structure is present, OffsetIndex must also be present.
889
For each field in this structure, <field>[i] refers to the page at
890
OffsetIndex.page_locations[i]
892
(defclass parquet-aes-gcm-v1 (parquet-object)
893
((aad-prefix :documentation "AAD prefix *
895
:initarg :aad-prefix :initform nil :type (or null octet-vector))
896
(aad-file-unique :documentation
897
"Unique file identifier part of AAD suffix *
899
:initarg :aad-file-unique :initform nil :type
900
(or null octet-vector))
901
(supply-aad-prefix :documentation
902
"In files encrypted with AAD prefix without storing it,
903
readers must supply the prefix *
905
:initarg :supply-aad-prefix :initform nil :type (or null boolean))))
906
(defclass parquet-aes-gcm-ctr-v1 (parquet-object)
907
((aad-prefix :documentation "AAD prefix *
909
:initarg :aad-prefix :initform nil :type (or null octet-vector))
910
(aad-file-unique :documentation
911
"Unique file identifier part of AAD suffix *
913
:initarg :aad-file-unique :initform nil :type
914
(or null octet-vector))
915
(supply-aad-prefix :documentation
916
"In files encrypted with AAD prefix without storing it,
917
readers must supply the prefix *
919
:initarg :supply-aad-prefix :initform nil :type (or null boolean))))
920
(defclass parquet-encryption-algorithm (parquet-object)
921
((aes-gcm-v1 :initarg :aes-gcm-v1 :initform nil :type
922
(or null parquet-aes-gcm-v1))
923
(aes-gcm-ctr-v1 :initarg :aes-gcm-ctr-v1 :initform nil :type
924
(or null parquet-aes-gcm-ctr-v1))))
925
(defclass parquet-file-meta-data (parquet-object)
926
((version :documentation "Version of this file *
928
:initarg :version :type (signed-byte 32))
929
(schema :documentation
930
"Parquet schema for this file. This schema contains metadata for all the columns.
931
The schema is represented as a tree with a single root. The nodes of the tree
932
are flattened to a list by doing a depth-first traversal.
933
The column metadata contains the path in the schema for that column which can be
934
used to map columns to nodes in the schema.
935
The first element is the root *
937
:initarg :schema :type (vector parquet-schema-element))
938
(num-rows :documentation "Number of rows in this file *
940
:initarg :num-rows :type (signed-byte 64))
941
(row-groups :documentation "Row groups in this file *
943
:initarg :row-groups :type (vector parquet-row-group))
944
(key-value-metadata :documentation "Optional key/value metadata *
946
:initarg :key-value-metadata :initform nil :type
947
(or null (vector parquet-key-value)))
948
(created-by :documentation
949
"String for application that wrote this file. This should be in the format
950
<Application> version <App Version> (build <App Build Hash>).
951
e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)
954
:initarg :created-by :initform nil :type (or null string))
955
(column-orders :documentation
956
"Sort order used for the min_value and max_value fields in the Statistics
957
objects and the min_values and max_values fields in the ColumnIndex
958
objects of each column in this file. Sort orders are listed in the order
959
matching the columns in the schema. The indexes are not necessarily the same
960
though, because only leaf nodes of the schema are represented in the list
963
Without column_orders, the meaning of the min_value and max_value fields
964
in the Statistics object and the ColumnIndex object is undefined. To ensure
965
well-defined behaviour, if these fields are written to a Parquet file,
966
column_orders must be written as well.
968
The obsolete min and max fields in the Statistics object are always sorted
969
by signed comparison regardless of column_orders.
971
:initarg :column-orders :initform nil :type
972
(or null (vector parquet-column-order)))
973
(encryption-algorithm :documentation
974
"Encryption algorithm. This field is set only in encrypted files
975
with plaintext footer. Files with encrypted footer store algorithm id
976
in FileCryptoMetaData structure.
978
:initarg :encryption-algorithm :initform nil :type
979
(or null parquet-encryption-algorithm))
980
(footer-signing-key-metadata :documentation
981
"Retrieval metadata of key used for signing the footer.
982
Used only in encrypted files with plaintext footer.
984
:initarg :footer-signing-key-metadata :initform nil :type
985
(or null octet-vector)))
986
(:documentation "Description for file metadata
988
(defclass parquet-file-crypto-meta-data (parquet-object)
989
((encryption-algorithm :documentation
990
"Encryption algorithm. This field is only used for files
991
with encrypted footer. Files with plaintext footer store algorithm id
992
inside footer (FileMetaData structure).
994
:initarg :encryption-algorithm :type parquet-encryption-algorithm)
995
(key-metadata :documentation
996
"Retrieval metadata of key used for encryption of footer,
997
and (possibly) columns *
999
:initarg :key-metadata :initform nil :type (or null octet-vector)))
1000
(:documentation "Crypto metadata for files with encrypted footer *