Skip to content

Commit d59f91a

Browse files
authored
feat: Add better roundtrip support (#142)
Follow-up for #141. See also apache/arrow#38891.
1 parent 8c09cbb commit d59f91a

File tree

8 files changed

+306
-106
lines changed

8 files changed

+306
-106
lines changed

Diff for: lib/src/main/java/io/cloudquery/helper/ArrowHelper.java

+107-60
Original file line numberDiff line numberDiff line change
@@ -8,45 +8,30 @@
88
import io.cloudquery.schema.Resource;
99
import io.cloudquery.schema.Table;
1010
import io.cloudquery.schema.Table.TableBuilder;
11+
import io.cloudquery.types.JSONType;
1112
import io.cloudquery.types.JSONType.JSONVector;
13+
import io.cloudquery.types.UUIDType;
1214
import io.cloudquery.types.UUIDType.UUIDVector;
1315
import java.io.ByteArrayOutputStream;
1416
import java.io.IOException;
1517
import java.nio.channels.Channels;
16-
import java.util.ArrayList;
17-
import java.util.HashMap;
18-
import java.util.List;
19-
import java.util.Map;
20-
import java.util.Objects;
18+
import java.time.Duration;
19+
import java.util.*;
2120
import org.apache.arrow.memory.BufferAllocator;
2221
import org.apache.arrow.memory.RootAllocator;
23-
import org.apache.arrow.vector.BigIntVector;
24-
import org.apache.arrow.vector.BitVector;
25-
import org.apache.arrow.vector.DateDayVector;
26-
import org.apache.arrow.vector.FieldVector;
27-
import org.apache.arrow.vector.FixedSizeBinaryVector;
28-
import org.apache.arrow.vector.Float4Vector;
29-
import org.apache.arrow.vector.Float8Vector;
30-
import org.apache.arrow.vector.IntVector;
31-
import org.apache.arrow.vector.LargeVarBinaryVector;
32-
import org.apache.arrow.vector.LargeVarCharVector;
33-
import org.apache.arrow.vector.SmallIntVector;
34-
import org.apache.arrow.vector.TimeStampVector;
35-
import org.apache.arrow.vector.TinyIntVector;
36-
import org.apache.arrow.vector.UInt1Vector;
37-
import org.apache.arrow.vector.UInt2Vector;
38-
import org.apache.arrow.vector.UInt4Vector;
39-
import org.apache.arrow.vector.UInt8Vector;
40-
import org.apache.arrow.vector.VarBinaryVector;
41-
import org.apache.arrow.vector.VarCharVector;
42-
import org.apache.arrow.vector.VectorSchemaRoot;
22+
import org.apache.arrow.vector.*;
4323
import org.apache.arrow.vector.ipc.ArrowReader;
4424
import org.apache.arrow.vector.ipc.ArrowStreamReader;
4525
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
26+
import org.apache.arrow.vector.types.pojo.ArrowType;
4627
import org.apache.arrow.vector.types.pojo.Field;
4728
import org.apache.arrow.vector.types.pojo.FieldType;
4829
import org.apache.arrow.vector.types.pojo.Schema;
4930
import org.apache.arrow.vector.util.Text;
31+
import org.joou.UByte;
32+
import org.joou.UInteger;
33+
import org.joou.ULong;
34+
import org.joou.UShort;
5035

5136
public class ArrowHelper {
5237
public static final String CQ_EXTENSION_INCREMENTAL = "cq:extension:incremental";
@@ -72,6 +57,32 @@ private static void setVectorData(FieldVector vector, Object data) {
7257
bitVector.set(0, (boolean) data ? 1 : 0);
7358
return;
7459
}
60+
if (vector instanceof DateDayVector dayDateVector) {
61+
dayDateVector.set(0, (int) data);
62+
return;
63+
}
64+
if (vector instanceof DateMilliVector dateMilliVector) {
65+
dateMilliVector.set(0, (long) data);
66+
return;
67+
}
68+
if (vector instanceof DurationVector durationVector) {
69+
Duration duration = (Duration) data;
70+
switch (durationVector.getUnit()) {
71+
case SECOND -> {
72+
durationVector.set(0, duration.toSeconds());
73+
}
74+
case MILLISECOND -> {
75+
durationVector.set(0, duration.toMillis());
76+
}
77+
case MICROSECOND -> {
78+
durationVector.set(0, duration.toNanos() / 1000);
79+
}
80+
case NANOSECOND -> {
81+
durationVector.set(0, duration.toNanos());
82+
}
83+
}
84+
return;
85+
}
7586
if (vector instanceof FixedSizeBinaryVector fixedSizeBinaryVector) {
7687
fixedSizeBinaryVector.set(0, (byte[]) data);
7788
return;
@@ -100,6 +111,22 @@ private static void setVectorData(FieldVector vector, Object data) {
100111
smallIntVector.set(0, (short) data);
101112
return;
102113
}
114+
if (vector instanceof TimeMicroVector timeMicroVector) {
115+
timeMicroVector.set(0, (long) data);
116+
return;
117+
}
118+
if (vector instanceof TimeMilliVector timeMilliVector) {
119+
timeMilliVector.set(0, (int) data);
120+
return;
121+
}
122+
if (vector instanceof TimeNanoVector timeNanoVector) {
123+
timeNanoVector.set(0, (long) data);
124+
return;
125+
}
126+
if (vector instanceof TimeSecVector timeSecVector) {
127+
timeSecVector.set(0, (int) data);
128+
return;
129+
}
103130
if (vector instanceof TimeStampVector timeStampVector) {
104131
timeStampVector.set(0, (long) data);
105132
return;
@@ -109,19 +136,19 @@ private static void setVectorData(FieldVector vector, Object data) {
109136
return;
110137
}
111138
if (vector instanceof UInt1Vector uInt1Vector) {
112-
uInt1Vector.set(0, (byte) data);
139+
uInt1Vector.set(0, ((UByte) data).shortValue());
113140
return;
114141
}
115142
if (vector instanceof UInt2Vector uInt2Vector) {
116-
uInt2Vector.set(0, (short) data);
143+
uInt2Vector.set(0, ((UShort) data).intValue());
117144
return;
118145
}
119146
if (vector instanceof UInt4Vector uInt4Vector) {
120-
uInt4Vector.set(0, (int) data);
147+
uInt4Vector.set(0, ((UInteger) data).intValue());
121148
return;
122149
}
123150
if (vector instanceof UInt8Vector uInt8Vector) {
124-
uInt8Vector.set(0, (long) data);
151+
uInt8Vector.set(0, ((ULong) data).longValue());
125152
return;
126153
}
127154
if (vector instanceof VarBinaryVector varBinaryVector) {
@@ -132,16 +159,14 @@ private static void setVectorData(FieldVector vector, Object data) {
132159
vectorCharVector.set(0, (Text) data);
133160
return;
134161
}
135-
if (vector instanceof UUIDVector uuidVector) {
136-
uuidVector.set(0, (java.util.UUID) data);
137-
return;
138-
}
162+
// CloudQuery-specific
139163
if (vector instanceof JSONVector jsonVector) {
140164
jsonVector.setSafe(0, (byte[]) data);
141165
return;
142166
}
143-
if (vector instanceof DateDayVector dayDateVector) {
144-
dayDateVector.set(0, (int) data);
167+
// CloudQuery-specific
168+
if (vector instanceof UUIDVector uuidVector) {
169+
uuidVector.set(0, (java.util.UUID) data);
145170
return;
146171
}
147172

@@ -177,17 +202,7 @@ public static Schema toArrowSchema(Table table) {
177202
List<Column> columns = table.getColumns();
178203
Field[] fields = new Field[columns.size()];
179204
for (int i = 0; i < columns.size(); i++) {
180-
Column column = columns.get(i);
181-
Map<String, String> metadata = new HashMap<>();
182-
metadata.put(CQ_EXTENSION_UNIQUE, Boolean.toString(column.isUnique()));
183-
metadata.put(CQ_EXTENSION_PRIMARY_KEY, Boolean.toString(column.isPrimaryKey()));
184-
metadata.put(CQ_EXTENSION_INCREMENTAL, Boolean.toString(column.isIncrementalKey()));
185-
Field field =
186-
new Field(
187-
column.getName(),
188-
new FieldType(!column.isNotNull(), column.getType(), null, metadata),
189-
null);
190-
fields[i] = field;
205+
fields[i] = getField(columns.get(i));
191206
}
192207
Map<String, String> metadata = new HashMap<>();
193208
metadata.put(CQ_TABLE_NAME, table.getName());
@@ -204,23 +219,21 @@ public static Schema toArrowSchema(Table table) {
204219
return new Schema(asList(fields), metadata);
205220
}
206221

222+
private static Field getField(Column column) {
223+
Map<String, String> metadata = new HashMap<>();
224+
metadata.put(CQ_EXTENSION_UNIQUE, Boolean.toString(column.isUnique()));
225+
metadata.put(CQ_EXTENSION_PRIMARY_KEY, Boolean.toString(column.isPrimaryKey()));
226+
metadata.put(CQ_EXTENSION_INCREMENTAL, Boolean.toString(column.isIncrementalKey()));
227+
return new Field(
228+
column.getName(),
229+
new FieldType(!column.isNotNull(), column.getType(), null, metadata),
230+
null);
231+
}
232+
207233
public static Table fromArrowSchema(Schema schema) {
208234
List<Column> columns = new ArrayList<>();
209235
for (Field field : schema.getFields()) {
210-
boolean isUnique = Objects.equals(field.getMetadata().get(CQ_EXTENSION_UNIQUE), "true");
211-
boolean isPrimaryKey =
212-
Objects.equals(field.getMetadata().get(CQ_EXTENSION_PRIMARY_KEY), "true");
213-
boolean isIncrementalKey =
214-
Objects.equals(field.getMetadata().get(CQ_EXTENSION_INCREMENTAL), "true");
215-
216-
columns.add(
217-
Column.builder()
218-
.name(field.getName())
219-
.unique(isUnique)
220-
.primaryKey(isPrimaryKey)
221-
.incrementalKey(isIncrementalKey)
222-
.type(field.getType())
223-
.build());
236+
columns.add(getColumn(field));
224237
}
225238

226239
Map<String, String> metaData = schema.getCustomMetadata();
@@ -244,6 +257,40 @@ public static Table fromArrowSchema(Schema schema) {
244257
return tableBuilder.build();
245258
}
246259

260+
private static Column getColumn(Field field) {
261+
boolean isUnique = Objects.equals(field.getMetadata().get(CQ_EXTENSION_UNIQUE), "true");
262+
boolean isPrimaryKey =
263+
Objects.equals(field.getMetadata().get(CQ_EXTENSION_PRIMARY_KEY), "true");
264+
boolean isIncrementalKey =
265+
Objects.equals(field.getMetadata().get(CQ_EXTENSION_INCREMENTAL), "true");
266+
267+
ArrowType fieldType = field.getType();
268+
String extensionName =
269+
field.getMetadata().get(ArrowType.ExtensionType.EXTENSION_METADATA_KEY_NAME);
270+
String extensionMetadata =
271+
field.getMetadata().get(ArrowType.ExtensionType.EXTENSION_METADATA_KEY_METADATA);
272+
273+
// We need to scan our extension types manually because of
274+
// https://github.com/apache/arrow/issues/38891
275+
if (JSONType.EXTENSION_NAME.equals(extensionName)
276+
&& JSONType.INSTANCE.serialize().equals(extensionMetadata)
277+
&& JSONType.INSTANCE.storageType().equals(fieldType)) {
278+
fieldType = JSONType.INSTANCE;
279+
} else if (UUIDType.EXTENSION_NAME.equals(extensionName)
280+
&& UUIDType.INSTANCE.serialize().equals(extensionMetadata)
281+
&& UUIDType.INSTANCE.storageType().equals(fieldType)) {
282+
fieldType = UUIDType.INSTANCE;
283+
}
284+
285+
return Column.builder()
286+
.name(field.getName())
287+
.unique(isUnique)
288+
.primaryKey(isPrimaryKey)
289+
.incrementalKey(isIncrementalKey)
290+
.type(fieldType)
291+
.build();
292+
}
293+
247294
public static ByteString encode(Resource resource) throws IOException {
248295
try (BufferAllocator bufferAllocator = new RootAllocator()) {
249296
Table table = resource.getTable();

Diff for: lib/src/main/java/io/cloudquery/scalar/DateMilli.java

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package io.cloudquery.scalar;
22

3+
import java.time.LocalDateTime;
34
import org.apache.arrow.vector.types.DateUnit;
45
import org.apache.arrow.vector.types.pojo.ArrowType;
56

@@ -34,6 +35,12 @@ public void setValue(Object value) throws ValidationException {
3435
return;
3536
}
3637

38+
if (value instanceof LocalDateTime localDateTime) {
39+
// we actually store only date
40+
this.value = localDateTime.toLocalDate().toEpochDay();
41+
return;
42+
}
43+
3744
throw new ValidationException(
3845
ValidationException.NO_CONVERSION_AVAILABLE, this.dataType(), value);
3946
}

Diff for: lib/src/main/java/io/cloudquery/scalar/Number.java

+5
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@ protected void setValue(Object value) throws ValidationException {
207207
return;
208208
}
209209

210+
if (value instanceof Character character) {
211+
this.value = UShort.valueOf(character);
212+
return;
213+
}
214+
210215
throw new ValidationException(
211216
ValidationException.NO_CONVERSION_AVAILABLE, this.dataType(), value);
212217
}

Diff for: lib/src/main/java/io/cloudquery/transformers/TypeTransformer.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ private static ArrowType transformArrowType(String name, Class<?> type)
3939
return Timestamp.dt;
4040
}
4141
case "java.util.UUID" -> {
42-
return new UUIDType();
42+
return UUIDType.INSTANCE;
4343
}
4444
default -> {
4545
if (type.isArray()) {

Diff for: lib/src/main/java/io/cloudquery/types/UUIDType.java

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import org.apache.arrow.vector.types.pojo.FieldType;
1313

1414
public class UUIDType extends ExtensionType {
15+
public static final UUIDType INSTANCE = new UUIDType();
1516
public static final int BYTE_WIDTH = 16;
1617
public static final String EXTENSION_NAME = "uuid";
1718

0 commit comments

Comments
 (0)