Skip to content
This repository was archived by the owner on Sep 1, 2022. It is now read-only.

Commit ce4f628

Browse files
authored
[PR #430] Normalize column types across kernels
2 parents dce0890 + c4237cb commit ce4f628

File tree

11 files changed

+106
-37
lines changed

11 files changed

+106
-37
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ help:
2727
@echo ' test-py - run python units'
2828
@echo ' test-js - run javascript units'
2929
@echo ' test-scala - run scala units'
30-
@echo ' test-r - run r units'
31-
@echo ' all - run all necessary streps to produce and validate a build'
30+
@echo ' test-r - run r units'
31+
@echo ' all - run all necessary steps to produce and validate a build'
3232

3333
# Docker images and repos
3434
ROOT_REPO:=jupyter/all-spark-notebook:2d878db5cbff
@@ -84,6 +84,7 @@ dev_image:
8484
apt-get -qq install --yes nodejs npm && \
8585
ln -s /usr/bin/nodejs /usr/bin/node && \
8686
npm install -g bower && \
87+
pip install pandas==0.18.1 && \
8788
Rscript /src-kernel-r/install.r && \
8889
mkdir -p /home/jovyan/.local/share/jupyter/nbextensions && \
8990
chown -R jovyan:users /home/jovyan/.local/share/jupyter/nbextensions'

elements/urth-core-dataframe/urth-core-dataframe.html

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@
105105
* The serialized representation of the DataFrame `ref`. The structure contains:
106106
* {
107107
* columns: Array of column names
108-
* column_types: Array of column type names
108+
* columnTypes: Array of column type names
109109
* data: 2D Array of rows with column values
110110
* index: Array with index values
111111
* }
@@ -127,7 +127,6 @@
127127

128128
/**
129129
* Column type metadata for the DataFrame
130-
* For internal use ONLY
131130
*/
132131
columnTypes: {
133132
type: Array,
@@ -283,25 +282,42 @@
283282
*/
284283
onModelValueChange: function(newVal){
285284
this._debug( "urth-core-dataframe onModelValueChange", newVal );
286-
newVal = this._deserializeDateTypes(newVal || {data:[]});
285+
newVal = this._deserializeDataTypes(newVal || {data:[]});
287286
this._setValue(newVal || {data:[]});
288287
},
289288

290289
/*
291-
* Returns the dataFrame with deserialized Date types of the dataFrame reconstructed via the column_types
290+
* Returns the dataFrame with deserialized Data types of the dataFrame reconstructed via the columnTypes
292291
* @param df - the serialized DataFrame
293-
* @return the reconstructed DataFrame
292+
* @return the reconstructed/deserialized DataFrame data types
294293
*/
295-
_deserializeDateTypes: function(df) {
296-
if (!df.column_types || !df.data) {
294+
_deserializeDataTypes: function(df) {
295+
if (!df.columnTypes || !df.data) {
297296
return df;
298297
}
299-
for (var i=0; i<df.column_types.length; i++) {
300-
//If this column is of type date convert the string content to a date object
301-
if(df.column_types[i] === "Date") {
302-
for (var j=0; j<df.data.length; j++) {
303-
df.data[j][i] = new Date(df.data[j][i]);
304-
}
298+
299+
var deserializeDataColumn = function(aDf, columnIndex, objectType) {
300+
aDf.data = aDf.data.map(function(row) {
301+
row[columnIndex] = new objectType(row[columnIndex]);
302+
return row;
303+
});
304+
return aDf;
305+
};
306+
307+
for (var i=0; i<df.columnTypes.length; i++) {
308+
switch(df.columnTypes[i]) {
309+
case "Date":
310+
df = deserializeDataColumn(df, i, Date);
311+
break;
312+
case "Number":
313+
df = deserializeDataColumn(df, i, Number);
314+
break;
315+
case "Boolean":
316+
df = deserializeDataColumn(df, i, Boolean);
317+
break;
318+
case "String":
319+
df = deserializeDataColumn(df, i, String);
320+
break;
305321
}
306322
}
307323
return df;
@@ -377,7 +393,7 @@
377393
* @private
378394
*/
379395
_columnTypes: function(df){
380-
return df.column_types;
396+
return df.columnTypes;
381397
},
382398

383399
/**

etc/docs/site/usage-docs/Serialization-of-data.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,23 @@ DataFrames from R, Pandas and Spark are serialized into the following structure:
2626
```javascript
2727
{
2828
columns: [], //array of column names
29+
columnTypes: [], //array of column type names
2930
data: [[]], //2 dimensional array with the data. The outer array holds each row.
3031
index: [] //index value for each row (partial support)
3132
}
3233
```
3334

35+
Note: Column types from the native kernel language to the client are resolved/mapped according to the table below:
36+
37+
JS objects| Pandas | pySpark |R data.frame| Spark R| Spark Scala|
38+
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
39+
Number | int64 | bigint | integer | integer | Int |
40+
Number | float64 | double | numeric | numeric | Double |
41+
Boolean | bool | boolean | logical | logical | Boolean
42+
String | [object (object of ndarray of strings)](http://stackoverflow.com/questions/21018654/strings-in-a-dataframe-but-dtype-is-object) *See type of Unknown | string | character | character | String
43+
Date | datetime64[ns] | date | Date | Date | TimestampType, DateType|
44+
Unknown | object or ambiguous type | object or ambiguous type | object or ambiguous type | object or ambiguous type |object or ambiguous type|
45+
3446
##### Other types
3547
Depending on the language, other types might have built-in serialization. Here is a list of them per language.
3648

etc/docs/site/usage-docs/Using-DataFrames.md

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Once the element finds the DataFrame in the kernel, it get the data and makes it
1919
```javascript
2020
{
2121
columns: [], //array of column names
22-
column_types: [], //array of column type names (Note for internal use only)
22+
columnTypes: [], //array of column type names
2323
data: [[]], //2 dimensional array with the data. The outer array holds each row.
2424
index: [] //index value for each row (partial support)
2525
}
@@ -50,7 +50,18 @@ OR
5050

5151
* The `columns` property has an Array of column names
5252

53-
* The `columnTypes` property has an Array of column type names (Note for internal use only)
53+
* The `columnTypes` property has an Array of column type names
54+
55+
Note: Column types from the native kernel language to the client are resolved/mapped according to the table below:
56+
57+
JS objects| Pandas | pySpark |R data.frame| Spark R| Spark Scala|
58+
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
59+
Number | int64 | bigint | integer | integer | Int |
60+
Number | float64 | double | numeric | numeric | Double |
61+
Boolean | bool | boolean | logical | logical | Boolean
62+
String | [object (object of ndarray of strings)](http://stackoverflow.com/questions/21018654/strings-in-a-dataframe-but-dtype-is-object) *See type of Unknown | string | character | character | String
63+
Date | datetime64[ns] | date | Date | Date | TimestampType, DateType|
64+
Unknown | object or ambiguous type | object or ambiguous type | object or ambiguous type | object or ambiguous type |object or ambiguous type|
5465

5566
All property can be used in data bindings to render the data. The example above shows the data displayed as a set of cards, but the same data can be visualize using many of the `<urth-viz-*>` elements.
5667

etc/notebooks/examples/urth-core-dataframe.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
"<template is=\"dom-bind\">\n",
9292
" <urth-core-dataframe ref=\"aDataFrame1\" value=\"{{x}}\"></urth-core-dataframe>\n",
9393
" <h4>{{x.columns}}</h4>\n",
94-
" <h4>{{x.column_types}}</h4>\n",
94+
" <h4>{{x.columnTypes}}</h4>\n",
9595
" <template is=\"dom-repeat\" items=\"{{x.data}}\">\n",
9696
" <div>{{item.0}}, {{item.1}}, {{item.2}}</div>\n",
9797
" </template>\n",

etc/notebooks/examples/urth-r-widgets.ipynb

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,6 @@
241241
" <span class='line phone-number'>{{sel.Amount}}</span>\n",
242242
" <span class='line email'>{{sel.Bigger_Number}}</span>\n",
243243
" <span class='line website'>{{sel.Website}}</span>\n",
244-
" <span class='line date'>{{sel.Date}}</span>\n",
245244
" </div>\n",
246245
" <div class='logo'></div>\n",
247246
" </div>\n",
@@ -323,7 +322,6 @@
323322
" <span class='line phone-number'>{{sel.Amount}}</span>\n",
324323
" <span class='line email'>{{sel.Bigger_Number}}</span>\n",
325324
" <span class='line website'>{{sel.Website}}</span>\n",
326-
" <span class='line date'>{{sel.Date}}</span>\n",
327325
" </div>\n",
328326
" <div class='logo'></div>\n",
329327
" </div>\n",

etc/notebooks/examples/urth-scala-widgets.ipynb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,6 @@
312312
" <span class=\"line phone-number\">{{item.3}}</span>\n",
313313
" <span class=\"line email\">{{item.4}}</span>\n",
314314
" <span class=\"line website\">{{item.5}}</span>\n",
315-
" <span class=\"line date\">{{item.6}}</span>\n",
316315
" </div>\n",
317316
" <div class=\"logo\"></div>\n",
318317
" </div>\n",

kernel-python/declarativewidgets/util/serializers.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,22 @@
2929
else:
3030
from .base_serializer_py3 import BaseSerializer
3131

32+
def normalize_type(data_type):
33+
"""Normalizes pandas/pyspark type names to the equivalent client side type names"""
34+
# disregard/normalize precision of the data_type
35+
data_type = re.sub("32|64(.*)", "", data_type)
36+
return {
37+
'int': 'Number',
38+
'bigint': 'Number',
39+
'float': 'Number',
40+
'double': 'Number',
41+
'bool': 'Boolean',
42+
'boolean': 'Boolean',
43+
'string': 'String',
44+
'datetime': 'Date',
45+
'date': 'Date'
46+
}.get(data_type, "Unknown")
47+
3248
class PandasSeriesSerializer(BaseSerializer):
3349
@staticmethod
3450
def klass():
@@ -65,10 +81,10 @@ def serialize(obj, **kwargs):
6581
# {index -> [index], columns -> [columns], data -> [values]}
6682
date_format = kwargs.get('date_format', 'iso')
6783
df_dict = json.loads(obj[:limit].to_json(orient='split', date_format=date_format))
68-
df_dict['column_types'] = kwargs.get('column_types', [str(x) for x in obj.dtypes.tolist()])
69-
df_dict['column_types'] = ["Date" if x == "datetime64[ns]" or x == "date" else x for x in df_dict['column_types']]
70-
for i in range(0, len(df_dict['column_types'])):
71-
if df_dict['column_types'][i] == "Date":
84+
df_dict['columnTypes'] = kwargs.get('columnTypes', [str(x) for x in obj.dtypes.tolist()])
85+
df_dict['columnTypes'] = [normalize_type(x) for x in df_dict['columnTypes']]
86+
for i in range(0, len(df_dict['columnTypes'])):
87+
if df_dict['columnTypes'][i] == "Date":
7288
for j in range(0, len(df_dict['data'])):
7389
#If this date element has no timezone drop the t/z from the serialized element
7490
#Note on filter where rows are dropped we must associate the dict index with the original df index
@@ -132,9 +148,9 @@ def serialize(obj, **kwargs):
132148
df = pandas.DataFrame.from_records(
133149
obj.limit(kwargs.get('limit', 100)).collect(), columns=obj.columns)
134150

135-
#recover column_types from the original object before it is collected/converted
136-
column_types = [str(x[1]) for x in obj.dtypes]
137-
return PandasDataFrameSerializer.serialize(df, column_types=column_types, **kwargs)
151+
#recover columnTypes from the original object before it is collected/converted
152+
columnTypes = [str(x[1]) for x in obj.dtypes]
153+
return PandasDataFrameSerializer.serialize(df, columnTypes=columnTypes, **kwargs)
138154

139155
@staticmethod
140156
def check_packages():

kernel-r/declarativewidgets/R/serializers.r

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,16 @@ serialize_element <- function(elem) {
1212
get_df_column_types <- function(df) {
1313
class_info <- lapply(aDataFrame, class)
1414
type_info <- list()
15-
for (i in class_info) {
16-
type_info <- append(type_info, i)
15+
for (klass in class_info) {
16+
switch(
17+
klass,
18+
Date = type_info <- append(type_info, "Date"),
19+
integer = type_info <- append(type_info, "Number"),
20+
numeric = type_info <- append(type_info, "Number"),
21+
logical = type_info <- append(type_info, "Boolean"),
22+
character = type_info <- append(type_info, "String"),
23+
type_info <- append(type_info, "Unknown")
24+
)
1725
}
1826
return (unlist(type_info))
1927
}
@@ -46,7 +54,7 @@ DataFrame_Serializer <- R6Class(
4654
serialize = function(obj, row_limit=100) {
4755
json <- list()
4856
json[['columns']] <- colnames(obj)
49-
json[['column_types']] <- get_df_column_types(obj)
57+
json[['columnTypes']] <- get_df_column_types(obj)
5058
json[['data']] <- self$df_to_lists(obj, row_limit)
5159
json[['index']] <- as.numeric(rownames(obj))
5260
return (json)
@@ -88,7 +96,7 @@ Spark_DataFrame_Serializer <- R6Class(
8896
df <- collect(limit(obj, row_limit))
8997
json <- list()
9098
json[['columns']] <- colnames(df)
91-
json[['column_types']] <- get_df_column_types(df)
99+
json[['columnTypes']] <- get_df_column_types(df)
92100
json[['data']] <- self$df_to_lists(df)
93101
json[['index']] <- list(1:nrow(df))
94102
return (json)

kernel-scala/src/main/scala/declarativewidgets/util/SerializationSupport.scala

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ import play.api.libs.json._
1212

1313
import org.apache.spark.sql.types.TimestampType
1414
import org.apache.spark.sql.types.DateType
15+
import org.apache.spark.sql.types.IntegerType
16+
import org.apache.spark.sql.types.StringType
17+
import org.apache.spark.sql.types.BooleanType
18+
import org.apache.spark.sql.types.DoubleType
1519

1620
/**
1721
* Contains methods for serializing a variable based on its type.
@@ -58,11 +62,15 @@ trait SerializationSupport extends LogLike {
5862

5963
//https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/package-tree.html
6064
//match possible date types the user may be using
61-
val column_types: Array[String] = df.schema.map(
65+
val columnTypes: Array[String] = df.schema.map(
6266
x => x.dataType match {
6367
case timeStamp: TimestampType => "Date"
6468
case date: DateType => "Date"
65-
case _ => x.dataType.toString()
69+
case int: IntegerType => "Number"
70+
case double: DoubleType => "Number"
71+
case string: StringType => "String"
72+
case boolean: BooleanType => "Boolean"
73+
case _ => "Unknown"
6674
}).toArray
6775

6876
val data: Array[Array[JsValue]] = df.toJSON.take(limit).map( jsonRow => toArray(Json.parse(jsonRow).as[JsObject], columns))
@@ -71,7 +79,7 @@ trait SerializationSupport extends LogLike {
7179

7280
Json.obj(
7381
"columns" -> columns,
74-
"column_types" -> column_types,
82+
"columnTypes" -> columnTypes,
7583
"index" -> index,
7684
"data" -> data
7785
)

kernel-scala/src/test/scala/declarativewidgets/util/SerializationSupportSpec.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ class SerializationSupportSpec extends FunSpec with Matchers with MockitoSugar
131131

132132
describe("toArray") {
133133
it("should return an array in the order specified by columns") {
134-
val support = new TestSupport();
134+
val support = new TestSupport()
135135

136136
val rowJson = Json.parse(
137137
"""

0 commit comments

Comments
 (0)