1
1
"""Computations for plot(df) function."""
2
2
3
- from typing import Any , Dict , List , Optional , Tuple
4
3
from itertools import combinations
4
+ from typing import Any , Dict , List , Optional , Tuple
5
5
6
6
import dask
7
7
import dask .array as da
8
8
import dask .dataframe as dd
9
9
import numpy as np
10
10
import pandas as pd
11
- from dask .array .stats import chisquare , normaltest , skew
12
- from scipy .stats import ks_2samp
11
+ from dask .array .stats import chisquare , skew
13
12
14
13
from ....errors import UnreachableError
15
14
from ...dtypes import (
24
23
is_dtype ,
25
24
)
26
25
from ...intermediate import Intermediate
27
- from .common import _calc_line_dt
26
+ from .common import _calc_line_dt , ks_2samp , normaltest
28
27
29
28
30
29
def compute_overview (
@@ -81,9 +80,7 @@ def compute_overview(
81
80
first_rows [col ].apply (hash )
82
81
except TypeError :
83
82
srs = df [col ] = srs .astype (str )
84
- datas .append (
85
- calc_nom_col (drop_null (srs ), first_rows [col ], ngroups , largest )
86
- )
83
+ datas .append (calc_nom_col (drop_null (srs ), ngroups , largest ))
87
84
col_names_dtypes .append ((col , Nominal ()))
88
85
elif is_dtype (col_dtype , Continuous ()):
89
86
## if cfg.hist_enable or cfg.any_insights("hist"):
@@ -179,9 +176,7 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
179
176
180
177
181
178
## def calc_nom_col(srs: dd.Series, first_rows: pd.Series, cfg: Config)
182
- def calc_nom_col (
183
- srs : dd .Series , first_rows : pd .Series , ngroups : int , largest : bool
184
- ) -> Dict [str , Any ]:
179
+ def calc_nom_col (srs : dd .Series , ngroups : int , largest : bool ) -> Dict [str , Any ]:
185
180
"""
186
181
Computations for a categorical column in plot(df)
187
182
@@ -227,9 +222,7 @@ def calc_nom_col(
227
222
## data["npresent"] = srs.shape[0]
228
223
229
224
## if cfg.insight.constant_length_enable:
230
- if not first_rows .apply (lambda x : isinstance (x , str )).all ():
231
- srs = srs .astype (str ) # srs must be a string to compute the value lengths
232
- length = srs .str .len ()
225
+ length = srs .apply (lambda v : len (str (v )), meta = (srs .name , np .int64 ))
233
226
data ["min_len" ], data ["max_len" ] = length .min (), length .max ()
234
227
235
228
return data
@@ -269,12 +262,13 @@ def calc_stats(
269
262
# compute distribution similarity on a data sample
270
263
# TODO: .map_partitions() fails for create_report because it calls calc_stats() with a pandas DataFrame
271
264
# df_smp = df.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=df)
272
- # NOTE ks_2samp triggers a .compute(), could use .delayed()
265
+
273
266
if num_cols : # remove this if statement when create_report is refactored
274
267
stats ["ks_tests" ] = []
275
268
for col1 , col2 in list (combinations (num_cols , 2 )):
276
- if ks_2samp (df [col1 ], df [col2 ])[1 ] > 0.05 :
277
- stats ["ks_tests" ].append ((col1 , col2 ))
269
+ stats ["ks_tests" ].append (
270
+ (col1 , col2 , ks_2samp (df [col1 ], df [col2 ])[1 ] > 0.05 )
271
+ )
278
272
279
273
return stats
280
274
@@ -299,9 +293,10 @@ def format_overview(data: Dict[str, Any]) -> List[Dict[str, str]]:
299
293
ins .append ({"Duplicates" : f"Dataset has { ndup } ({ pdup } %) duplicate rows" })
300
294
301
295
## if cfg.insight.similar_distribution_enable
302
- for cols in data .get ("ks_tests" , []):
303
- msg = f"{ cols [0 ]} and { cols [1 ]} have similar distributions"
304
- ins .append ({"Similar Distribution" : msg })
296
+ for (* cols , test_result ) in data .get ("ks_tests" , []):
297
+ if test_result :
298
+ msg = f"{ cols [0 ]} and { cols [1 ]} have similar distributions"
299
+ ins .append ({"Similar Distribution" : msg })
305
300
306
301
data .pop ("ks_tests" , None )
307
302
0 commit comments