Skip to content
This repository was archived by the owner on Sep 1, 2022. It is now read-only.

Commit ce4f628

Browse files
authored
[PR #430] Normalize column types across kernels
2 parents dce0890 + c4237cb commit ce4f628

File tree

11 files changed

+106
-37
lines changed

11 files changed

+106
-37
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ help:
2727
@echo ' test-py - run python units'
2828
@echo ' test-js - run javascript units'
2929
@echo ' test-scala - run scala units'
30-
@echo ' test-r - run r units'
31-
@echo ' all - run all necessary streps to produce and validate a build'
30+
@echo ' test-r - run r units'
31+
@echo ' all - run all necessary steps to produce and validate a build'
3232

3333
# Docker images and repos
3434
ROOT_REPO:=jupyter/all-spark-notebook:2d878db5cbff
@@ -84,6 +84,7 @@ dev_image:
8484
apt-get -qq install --yes nodejs npm && \
8585
ln -s /usr/bin/nodejs /usr/bin/node && \
8686
npm install -g bower && \
87+
pip install pandas==0.18.1 && \
8788
Rscript /src-kernel-r/install.r && \
8889
mkdir -p /home/jovyan/.local/share/jupyter/nbextensions && \
8990
chown -R jovyan:users /home/jovyan/.local/share/jupyter/nbextensions'

elements/urth-core-dataframe/urth-core-dataframe.html

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@
105105
* The serialized representation of the DataFrame `ref`. The structure contains:
106106
* {
107107
* columns: Array of column names
108-
* column_types: Array of column type names
108+
* columnTypes: Array of column type names
109109
* data: 2D Array of rows with column values
110110
* index: Array with index values
111111
* }
@@ -127,7 +127,6 @@
127127

128128
/**
129129
* Column type metadata for the DataFrame
130-
* For internal use ONLY
131130
*/
132131
columnTypes: {
133132
type: Array,
@@ -283,25 +282,42 @@
283282
*/
284283
onModelValueChange: function(newVal){
285284
this._debug( "urth-core-dataframe onModelValueChange", newVal );
286-
newVal = this._deserializeDateTypes(newVal || {data:[]});
285+
newVal = this._deserializeDataTypes(newVal || {data:[]});
287286
this._setValue(newVal || {data:[]});
288287
},
289288

290289
/*
291-
* Returns the dataFrame with deserialized Date types of the dataFrame reconstructed via the column_types
290+
* Returns the dataFrame with deserialized Data types of the dataFrame reconstructed via the columnTypes
292291
* @param df - the serialized DataFrame
293-
* @return the reconstructed DataFrame
292+
* @return the reconstructed/deserialized DataFrame data types
294293
*/
295-
_deserializeDateTypes: function(df) {
296-
if (!df.column_types || !df.data) {
294+
_deserializeDataTypes: function(df) {
295+
if (!df.columnTypes || !df.data) {
297296
return df;
298297
}
299-
for (var i=0; i<df.column_types.length; i++) {
300-
//If this column is of type date convert the string content to a date object
301-
if(df.column_types[i] === "Date") {
302-
for (var j=0; j<df.data.length; j++) {
303-
df.data[j][i] = new Date(df.data[j][i]);
304-
}
298+
299+
var deserializeDataColumn = function(aDf, columnIndex, objectType) {
300+
aDf.data = aDf.data.map(function(row) {
301+
row[columnIndex] = new objectType(row[columnIndex]);
302+
return row;
303+
});
304+
return aDf;
305+
};
306+
307+
for (var i=0; i<df.columnTypes.length; i++) {
308+
switch(df.columnTypes[i]) {
309+
case "Date":
310+
df = deserializeDataColumn(df, i, Date);
311+
break;
312+
case "Number":
313+
df = deserializeDataColumn(df, i, Number);
314+
break;
315+
case "Boolean":
316+
df = deserializeDataColumn(df, i, Boolean);
317+
break;
318+
case "String":
319+
df = deserializeDataColumn(df, i, String);
320+
break;
305321
}
306322
}
307323
return df;
@@ -377,7 +393,7 @@
377393
* @private
378394
*/
379395
_columnTypes: function(df){
380-
return df.column_types;
396+
return df.columnTypes;
381397
},
382398

383399
/**

etc/docs/site/usage-docs/Serialization-of-data.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,23 @@ DataFrames from R, Pandas and Spark are serialized into the following structure:
2626
```javascript
2727
{
2828
columns: [], //array of column names
29+
columnTypes: [], //array of column type names
2930
data: [[]], //2 dimensional array with the data. The outer array holds each row.
3031
index: [] //index value for each row (partial support)
3132
}
3233
```
3334

35+
Note: Column types from the native kernel language to the client are resolved/mapped according to the table below:
36+
37+
JS objects| Pandas | pySpark |R data.frame| Spark R| Spark Scala|
38+
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
39+
Number | int64 | bigint | integer | integer | Int |
40+
Number | float64 | double | numeric | numeric | Double |
41+
Boolean | bool | boolean | logical | logical | Boolean
42+
String | [object (object of ndarray of strings)](http://stackoverflow.com/questions/21018654/strings-in-a-dataframe-but-dtype-is-object) *See type of Unknown | string | character | character | String
43+
Date | datetime64[ns] | date | Date | Date | TimestampType, DateType|
44+
Unknown | object or ambiguous type | object or ambiguous type | object or ambiguous type | object or ambiguous type |object or ambiguous type|
45+
3446
##### Other types
3547
Depending on the language, other types might have built-in serialization. Here is a list of them per language.
3648

etc/docs/site/usage-docs/Using-DataFrames.md

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Once the element finds the DataFrame in the kernel, it get the data and makes it
1919
```javascript
2020
{
2121
columns: [], //array of column names
22-
column_types: [], //array of column type names (Note for internal use only)
22+
columnTypes: [], //array of column type names
2323
data: [[]], //2 dimensional array with the data. The outer array holds each row.
2424
index: [] //index value for each row (partial support)
2525
}
@@ -50,7 +50,18 @@ OR
5050

5151
* The `columns` property has an Array of column names
5252

53-
* The `columnTypes` property has an Array of column type names (Note for internal use only)
53+
* The `columnTypes` property has an Array of column type names
54+
55+
Note: Column types from the native kernel language to the client are resolved/mapped according to the table below:
56+
57+
JS objects| Pandas | pySpark |R data.frame| Spark R| Spark Scala|
58+
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
59+
Number | int64 | bigint | integer | integer | Int |
60+
Number | float64 | double | numeric | numeric | Double |
61+
Boolean | bool | boolean | logical | logical | Boolean
62+
String | [object (object of ndarray of strings)](http://stackoverflow.com/questions/21018654/strings-in-a-dataframe-but-dtype-is-object) *See type of Unknown | string | character | character | String
63+
Date | datetime64[ns] | date | Date | Date | TimestampType, DateType|
64+
Unknown | object or ambiguous type | object or ambiguous type | object or ambiguous type | object or ambiguous type |object or ambiguous type|
5465

5566
All property can be used in data bindings to render the data. The example above shows the data displayed as a set of cards, but the same data can be visualize using many of the `<urth-viz-*>` elements.
5667

etc/notebooks/examples/urth-core-dataframe.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
"<template is=\"dom-bind\">\n",
9292
" <urth-core-dataframe ref=\"aDataFrame1\" value=\"{{x}}\"></urth-core-dataframe>\n",
9393
" <h4>{{x.columns}}</h4>\n",
94-
" <h4>{{x.column_types}}</h4>\n",
94+
" <h4>{{x.columnTypes}}</h4>\n",
9595
" <template is=\"dom-repeat\" items=\"{{x.data}}\">\n",
9696
" <div>{{item.0}}, {{item.1}}, {{item.2}}</div>\n",
9797
" </template>\n",

etc/notebooks/examples/urth-r-widgets.ipynb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,6 @@
241241
" <span class='line phone-number'>{{sel.Amount}}</span>\n",
242242
" <span class='line email'>{{sel.Bigger_Number}}</span>\n",
243243
" <span class='line website'>{{sel.Website}}</span>\n",
244-
" <span class='line date'>{{sel.Date}}</span>\n",
245244
" </div>\n",
246245
" <div class='logo'></div>\n",
247246
" </div>\n",
@@ -323,7 +322,6 @@
323322
" <span class='line phone-number'>{{sel.Amount}}</span>\n",
324323
" <span class='line email'>{{sel.Bigger_Number}}</span>\n",
325324
" <span class='line website'>{{sel.Website}}</span>\n",
326-
" <span class='line date'>{{sel.Date}}</span>\n",
327325
" </div>\n",
328326
" <div class='logo'></div>\n",
329327
" </div>\n",

etc/notebooks/examples/urth-scala-widgets.ipynb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,6 @@
312312
" <span class=\"line phone-number\">{{item.3}}</span>\n",
313313
" <span class=\"line email\">{{item.4}}</span>\n",
314314
" <span class=\"line website\">{{item.5}}</span>\n",
315-
" <span class=\"line date\">{{item.6}}</span>\n",
316315
" </div>\n",
317316
" <div class=\"logo\"></div>\n",
318317
" </div>\n",

kernel-python/declarativewidgets/util/serializers.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,22 @@
2929
else:
3030
from .base_serializer_py3 import BaseSerializer
3131

32+
def normalize_type(data_type):
33+
"""Normalizes pandas/pyspark type names to the equivalent client side type names"""
34+
# disregard/normalize precision of the data_type
35+
data_type = re.sub("32|64(.*)", "", data_type)
36+
return {
37+
'int': 'Number',
38+
'bigint': 'Number',
39+
'float': 'Number',
40+
'double': 'Number',
41+
'bool': 'Boolean',
42+
'boolean': 'Boolean',
43+
'string': 'String',
44+
'datetime': 'Date',
45+
'date': 'Date'
46+
}.get(data_type, "Unknown")
47+
3248
class PandasSeriesSerializer(BaseSerializer):
3349
@staticmethod
3450
def klass():
@@ -65,10 +81,10 @@ def serialize(obj, **kwargs):
6581
# {index -> [index], columns -> [columns], data -> [values]}
6682
date_format = kwargs.get('date_format', 'iso')
6783
df_dict = json.loads(obj[:limit].to_json(orient='split', date_format=date_format))
68-
df_dict['column_types'] = kwargs.get('column_types', [str(x) for x in obj.dtypes.tolist()])
69-
df_dict['column_types'] = ["Date" if x == "datetime64[ns]" or x == "date" else x for x in df_dict['column_types']]
70-
for i in range(0, len(df_dict['column_types'])):
71-
if df_dict['column_types'][i] == "Date":
84+
df_dict['columnTypes'] = kwargs.get('columnTypes', [str(x) for x in obj.dtypes.tolist()])
85+
df_dict['columnTypes'] = [normalize_type(x) for x in df_dict['columnTypes']]
86+
for i in range(0, len(df_dict['columnTypes'])):
87+
if df_dict['columnTypes'][i] == "Date":
7288
for j in range(0, len(df_dict['data'])):
7389
#If this date element has no timezone drop the t/z from the serialized element
7490
#Note on filter where rows are dropped we must associate the dict index with the original df index
@@ -132,9 +148,9 @@ def serialize(obj, **kwargs):
132148
df = pandas.DataFrame.from_records(
133149
obj.limit(kwargs.get('limit', 100)).collect(), columns=obj.columns)
134150

135-
#recover column_types from the original object before it is collected/converted
136-
column_types = [str(x[1]) for x in obj.dtypes]
137-
return PandasDataFrameSerializer.serialize(df, column_types=column_types, **kwargs)
151+
#recover columnTypes from the original object before it is collected/converted
152+
columnTypes = [str(x[1]) for x in obj.dtypes]
153+
return PandasDataFrameSerializer.serialize(df, columnTypes=columnTypes, **kwargs)
138154

139155
@staticmethod
140156
def check_packages():

kernel-r/declarativewidgets/R/serializers.r

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,16 @@ serialize_element <- function(elem) {
1212
get_df_column_types <- function(df) {
1313
class_info <- lapply(aDataFrame, class)
1414
type_info <- list()
15-
for (i in class_info) {
16-
type_info <- append(type_info, i)
15+
for (klass in class_info) {
16+
switch(
17+
klass,
18+
Date = type_info <- append(type_info, "Date"),
19+
integer = type_info <- append(type_info, "Number"),
20+
numeric = type_info <- append(type_info, "Number"),
21+
logical = type_info <- append(type_info, "Boolean"),
22+
character = type_info <- append(type_info, "String"),
23+
type_info <- append(type_info, "Unknown")
24+
)
1725
}
1826
return (unlist(type_info))
1927
}
@@ -46,7 +54,7 @@ DataFrame_Serializer <- R6Class(
4654
serialize = function(obj, row_limit=100) {
4755
json <- list()
4856
json[['columns']] <- colnames(obj)
49-
json[['column_types']] <- get_df_column_types(obj)
57+
json[['columnTypes']] <- get_df_column_types(obj)
5058
json[['data']] <- self$df_to_lists(obj, row_limit)
5159
json[['index']] <- as.numeric(rownames(obj))
5260
return (json)
@@ -88,7 +96,7 @@ Spark_DataFrame_Serializer <- R6Class(
8896
df <- collect(limit(obj, row_limit))
8997
json <- list()
9098
json[['columns']] <- colnames(df)
91-
json[['column_types']] <- get_df_column_types(df)
99+
json[['columnTypes']] <- get_df_column_types(df)
92100
json[['data']] <- self$df_to_lists(df)
93101
json[['index']] <- list(1:nrow(df))
94102
return (json)

kernel-scala/src/main/scala/declarativewidgets/util/SerializationSupport.scala

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ import play.api.libs.json._
1212

1313
import org.apache.spark.sql.types.TimestampType
1414
import org.apache.spark.sql.types.DateType
15+
import org.apache.spark.sql.types.IntegerType
16+
import org.apache.spark.sql.types.StringType
17+
import org.apache.spark.sql.types.BooleanType
18+
import org.apache.spark.sql.types.DoubleType
1519

1620
/**
1721
* Contains methods for serializing a variable based on its type.
@@ -58,11 +62,15 @@ trait SerializationSupport extends LogLike {
5862

5963
//https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/package-tree.html
6064
//match possible date types the user may be using
61-
val column_types: Array[String] = df.schema.map(
65+
val columnTypes: Array[String] = df.schema.map(
6266
x => x.dataType match {
6367
case timeStamp: TimestampType => "Date"
6468
case date: DateType => "Date"
65-
case _ => x.dataType.toString()
69+
case int: IntegerType => "Number"
70+
case double: DoubleType => "Number"
71+
case string: StringType => "String"
72+
case boolean: BooleanType => "Boolean"
73+
case _ => "Unknown"
6674
}).toArray
6775

6876
val data: Array[Array[JsValue]] = df.toJSON.take(limit).map( jsonRow => toArray(Json.parse(jsonRow).as[JsObject], columns))
@@ -71,7 +79,7 @@ trait SerializationSupport extends LogLike {
7179

7280
Json.obj(
7381
"columns" -> columns,
74-
"column_types" -> column_types,
82+
"columnTypes" -> columnTypes,
7583
"index" -> index,
7684
"data" -> data
7785
)

kernel-scala/src/test/scala/declarativewidgets/util/SerializationSupportSpec.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ class SerializationSupportSpec extends FunSpec with Matchers with MockitoSugar
131131

132132
describe("toArray") {
133133
it("should return an array in the order specified by columns") {
134-
val support = new TestSupport();
134+
val support = new TestSupport()
135135

136136
val rowJson = Json.parse(
137137
"""

0 commit comments

Comments
 (0)