13
13
import pytest
14
14
15
15
from delphi_utils .archive import ArchiveDiffer , GitArchiveDiffer , S3ArchiveDiffer ,\
16
- archiver_from_params
16
+ archiver_from_params , Nans
17
17
18
- CSV_DTYPES = {"geo_id" : str , "val" : float , "se" : float , "sample_size" : float }
18
+ CSV_DTYPES = {
19
+ "geo_id" : str , "val" : float , "se" : float , "sample_size" : float ,
20
+ "missing_val" : int , "missing_se" :int , "missing_sample_size" : int
21
+ }
19
22
20
23
CSVS_BEFORE = {
21
24
# Common
22
25
"csv0" : pd .DataFrame ({
23
26
"geo_id" : ["1" , "2" , "3" ],
24
27
"val" : [1.000000001 , 2.00000002 , 3.00000003 ],
25
28
"se" : [0.1 , 0.2 , 0.3 ],
26
- "sample_size" : [10.0 , 20.0 , 30.0 ]}),
29
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
30
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
31
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
32
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
33
+ }),
27
34
28
35
"csv1" : pd .DataFrame ({
29
36
"geo_id" : ["1" , "2" , "3" ],
30
37
"val" : [1.0 , 2.0 , 3.0 ],
31
38
"se" : [np .nan , 0.20000002 , 0.30000003 ],
32
- "sample_size" : [10.0 , 20.0 , 30.0 ]}),
39
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
40
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
41
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
42
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
43
+ }),
33
44
34
45
# Deleted
35
46
"csv2" : pd .DataFrame ({
36
47
"geo_id" : ["1" ],
37
48
"val" : [1.0 ],
38
49
"se" : [0.1 ],
39
- "sample_size" : [10.0 ]}),
50
+ "sample_size" : [10.0 ],
51
+ "missing_val" : [Nans .NOT_MISSING ],
52
+ "missing_se" : [Nans .NOT_MISSING ],
53
+ "missing_sample_size" : [Nans .NOT_MISSING ],
54
+ }),
40
55
}
41
56
42
57
CSVS_AFTER = {
45
60
"geo_id" : ["1" , "2" , "3" ],
46
61
"val" : [1.0 , 2.0 , 3.0 ],
47
62
"se" : [0.10000001 , 0.20000002 , 0.30000003 ],
48
- "sample_size" : [10.0 , 20.0 , 30.0 ]}),
63
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
64
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
65
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
66
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
67
+ }),
49
68
50
69
"csv1" : pd .DataFrame ({
51
70
"geo_id" : ["1" , "2" , "4" ],
52
71
"val" : [1.0 , 2.1 , 4.0 ],
53
72
"se" : [np .nan , 0.21 , np .nan ],
54
- "sample_size" : [10.0 , 21.0 , 40.0 ]}),
73
+ "sample_size" : [10.0 , 21.0 , 40.0 ],
74
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
75
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
76
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
77
+ }),
55
78
56
79
# Added
57
80
"csv3" : pd .DataFrame ({
58
81
"geo_id" : ["2" ],
59
82
"val" : [2.0000002 ],
60
83
"se" : [0.2 ],
61
- "sample_size" : [20.0 ]}),
84
+ "sample_size" : [20.0 ],
85
+ "missing_val" : [Nans .NOT_MISSING ],
86
+ "missing_se" : [Nans .NOT_MISSING ],
87
+ "missing_sample_size" : [Nans .NOT_MISSING ],
88
+ }),
62
89
}
63
90
64
91
@@ -80,10 +107,14 @@ def test_diff_and_filter_exports(self, tmp_path):
80
107
mkdir (export_dir )
81
108
82
109
csv1_diff = pd .DataFrame ({
83
- "geo_id" : ["2" , "4" ],
84
- "val" : [2.1 , 4.0 ],
85
- "se" : [0.21 , np .nan ],
86
- "sample_size" : [21.0 , 40.0 ]})
110
+ "geo_id" : ["3" , "2" , "4" ],
111
+ "val" : [np .nan , 2.1 , 4.0 ],
112
+ "se" : [np .nan , 0.21 , np .nan ],
113
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
114
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
115
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
116
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
117
+ })
87
118
88
119
arch_diff = ArchiveDiffer (cache_dir , export_dir )
89
120
@@ -261,10 +292,14 @@ def test_run(self, tmp_path, s3_client):
261
292
# Check exports directory just has incremental changes
262
293
assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
263
294
csv1_diff = pd .DataFrame ({
264
- "geo_id" : ["2" , "4" ],
265
- "val" : [2.1 , 4.0 ],
266
- "se" : [0.21 , np .nan ],
267
- "sample_size" : [21.0 , 40.0 ]})
295
+ "geo_id" : ["3" , "2" , "4" ],
296
+ "val" : [np .nan , 2.1 , 4.0 ],
297
+ "se" : [np .nan , 0.21 , np .nan ],
298
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
299
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
300
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
301
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
302
+ })
268
303
assert_frame_equal (
269
304
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
270
305
csv1_diff )
@@ -346,7 +381,11 @@ def test_diff_exports(self, tmp_path):
346
381
"geo_id" : ["1" , "2" , "3" ],
347
382
"val" : [1.0 , 2.0 , 3.0 ],
348
383
"se" : [0.1 , 0.2 , 0.3 ],
349
- "sample_size" : [10.0 , 20.0 , 30.0 ]})
384
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
385
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
386
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
387
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
388
+ })
350
389
351
390
# Write exact same CSV into cache and export, so no diffs expected
352
391
csv1 .to_csv (join (cache_dir , "csv1.csv" ), index = False )
@@ -383,7 +422,11 @@ def test_archive_exports(self, tmp_path):
383
422
"geo_id" : ["1" , "2" , "3" ],
384
423
"val" : [1.0 , 2.0 , 3.0 ],
385
424
"se" : [0.1 , 0.2 , 0.3 ],
386
- "sample_size" : [10.0 , 20.0 , 30.0 ]})
425
+ "sample_size" : [10.0 , 20.0 , 30.0 ],
426
+ "missing_val" : [Nans .NOT_MISSING ] * 3 ,
427
+ "missing_se" : [Nans .NOT_MISSING ] * 3 ,
428
+ "missing_sample_size" : [Nans .NOT_MISSING ] * 3 ,
429
+ })
387
430
388
431
# csv1.csv is now a dirty edit in the repo, and to be exported too
389
432
csv1 .to_csv (join (cache_dir , "csv1.csv" ), index = False )
@@ -462,10 +505,14 @@ def test_run(self, tmp_path):
462
505
# Check exports directory just has incremental changes
463
506
assert set (listdir (export_dir )) == {"csv1.csv" , "csv3.csv" }
464
507
csv1_diff = pd .DataFrame ({
465
- "geo_id" : ["2" , "4" ],
466
- "val" : [2.1 , 4.0 ],
467
- "se" : [0.21 , np .nan ],
468
- "sample_size" : [21.0 , 40.0 ]})
508
+ "geo_id" : ["3" , "2" , "4" ],
509
+ "val" : [np .nan , 2.1 , 4.0 ],
510
+ "se" : [np .nan , 0.21 , np .nan ],
511
+ "sample_size" : [np .nan , 21.0 , 40.0 ],
512
+ "missing_val" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
513
+ "missing_se" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
514
+ "missing_sample_size" : [Nans .DELETED ] + [Nans .NOT_MISSING ] * 2 ,
515
+ })
469
516
assert_frame_equal (
470
517
pd .read_csv (join (export_dir , "csv1.csv" ), dtype = CSV_DTYPES ),
471
518
csv1_diff )
0 commit comments