import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)
- from shared_utils import gtfs_utils_v2

+ from shared_utils import gtfs_utils_v2
from calitp_data_analysis.tables import tbls
+ from calitp_data_analysis.sql import query_sql
from siuba import *
import pandas as pd
import datetime as dt
-
import conveyal_vars

+ TARGET_DATE = conveyal_vars.TARGET_DATE
+ REGIONAL_SUBFEED_NAME = "Regional Subfeed"
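+ # Maps datetime.weekday() integers (0 = Monday) to GTFS calendar.txt day-name columns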
+ INT_TO_GTFS_WEEKDAY = {
+     0: "monday",
+     1: "tuesday",
+     2: "wednesday",
+     3: "thursday",
+     4: "friday",
+     5: "saturday",
+     6: "sunday",
+ }
+
def check_defined_elsewhere(row, df):
    '''
    for feeds without service defined, check if the same service is captured in another feed that does include service
@@ -17,8 +29,6 @@ def check_defined_elsewhere(row, df):
    row['service_any_feed'] = is_defined
    return row

- TARGET_DATE = conveyal_vars.TARGET_DATE
-
def get_feeds_check_service():
    feeds_on_target = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=TARGET_DATE)
    feeds_on_target = feeds_on_target.rename(columns={'name': 'gtfs_dataset_name'})
@@ -38,39 +48,153 @@ def get_feeds_check_service():
    return feeds_on_target

def attach_transit_services(feeds_on_target: pd.DataFrame):
-
+     """Associate each feed in feeds_on_target.gtfs_dataset_key with a transit service"""
    target_dt = dt.datetime.combine(dt.date.fromisoformat(TARGET_DATE), dt.time(0))

    services = (tbls.mart_transit_database.dim_gtfs_service_data()
-         >> filter(_._valid_from <= target_dt, _._valid_to > target_dt)
+         >> filter(
+             _._valid_from <= target_dt, _._valid_to > target_dt
+         )
        # >> filter(_.gtfs_dataset_key == 'da7e9e09d3eec6c7686adc21c8b28b63') # test with BCT
        # >> filter(_.service_key == '5bc7371dca26d74a99be945b18b3174e')
-         >> select(_.service_key, _.gtfs_dataset_key)
+         >> select(_.service_key, _.gtfs_dataset_key, _.customer_facing)
        >> collect()
    )

-     feeds_on_target = feeds_on_target >> left_join(_, services, on='gtfs_dataset_key')
-     return feeds_on_target
+     feeds_services_merged = feeds_on_target.merge(
+         services, how="left", on='gtfs_dataset_key', validate="one_to_many"
+     )
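+     # keep only services that are customer facing or that belong to a regional subfeed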
+     feeds_services_filtered = feeds_services_merged.loc[
+         feeds_services_merged["customer_facing"] | (feeds_services_merged["regional_feed_type"] == REGIONAL_SUBFEED_NAME)
+     ].copy()
+     return feeds_services_filtered

- def report_undefined(feeds_on_target: pd.DataFrame):
-     fname = 'no_apparent_service.csv'
+ def get_undefined_feeds(feeds_on_target: pd.DataFrame) -> pd.DataFrame:
+     """Return feeds in feeds_on_target that have no service defined, either in the feed itself or in any other feed"""
    undefined = feeds_on_target.apply(check_defined_elsewhere, axis=1, args=[feeds_on_target]) >> filter(-_.service_any_feed)
+     return undefined
+
+ def report_unavailable_feeds(feeds: pd.DataFrame, fname: str) -> None:
+     """Create a csv report of unavailable or backdated feeds at the path specified by fname"""
+     undefined = feeds.loc[
+         feeds["valid_date_other_than_service_date"] | feeds["no_schedule_feed_found"]
+     ].copy()
    if undefined.empty:
        print('no undefined service feeds')
    else:
-         print(undefined.columns)
        print('these feeds have no service defined on target date, nor are their services captured in other feeds:')
-         # gtfs_dataset_name no longer present, this whole script should probably be updated/replaced
-         print(undefined >> select(_.gtfs_dataset_name, _.service_any_feed))
+         print(undefined.loc[undefined["no_schedule_feed_found"], "gtfs_dataset_name"].drop_duplicates())
+         print('these feeds have defined service, but only in a feed defined on a prior day:')
+         print(undefined.loc[undefined["valid_date_other_than_service_date"], "gtfs_dataset_name"].drop_duplicates())
    print(f'saving detailed csv to {fname}')
-     undefined.to_csv(fname)
-     return
+     undefined.to_csv(fname, index=False)
+
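+ # strftime format for rendering dates as ISO YYYY-MM-DD literals in SQL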
+ ISO_DATE_ONLY_FORMAT = "%Y-%m-%d"
+
+ def get_old_feeds(undefined_feeds_base64_urls: pd.Series, target_date: dt.date | dt.datetime, max_lookback_timedelta: dt.timedelta) -> pd.DataFrame:
+     """
+     Search the warehouse for feeds downloaded within the time before target_date
+     defined by max_lookback_timedelta that have service, as defined in calendar.txt,
+     on target_date. These feeds will not be valid on target_date, but will be accepted by Conveyal.
+     This should not be used if the feeds are valid on target_date, since it would needlessly
+     return outdated feeds. Note that this does not check calendar_dates.txt at present.
+
+     Parameters:
+     undefined_feeds_base64_urls: a pandas Series of base64 urls to feeds in the warehouse
+     target_date: a date or datetime on which the feeds should be valid based on calendar.txt
+     max_lookback_timedelta: a timedelta defining how far before target_date a feed may have been available
+
+     Returns:
+     A DataFrame with the following index and columns:
+         index: the base64 url of the feed, matching entries in undefined_feeds_base64_urls
+         feed_key: a key into dim_schedule_feeds for the feed on the date it was last valid in the warehouse
+         date_processed: the date on which the feed was last valid in the warehouse
+     """
+     base_64_urls_str = "('" + "', '".join(undefined_feeds_base64_urls) + "')"
+     day_of_the_week = INT_TO_GTFS_WEEKDAY[target_date.weekday()]
+     max_lookback_date_iso = (target_date - max_lookback_timedelta).strftime(ISO_DATE_ONLY_FORMAT)
+     target_date_iso = target_date.strftime(ISO_DATE_ONLY_FORMAT)
+     # Find the newest version of each feed where calendar.txt defines service on
+     # the target date (including its day of the week), among feed versions that
+     # expired between the max lookback date and the target date (inclusive)
+     query = f"""
+         SELECT
+             `mart_gtfs.dim_schedule_feeds`.base64_url AS base64_url,
+             `mart_gtfs.dim_schedule_feeds`.key AS feed_key,
+             MAX(`mart_gtfs.dim_schedule_feeds`._valid_to) AS valid_feed_date
+         FROM `mart_gtfs.dim_schedule_feeds`
+         LEFT JOIN `mart_gtfs.dim_calendar`
+             ON `mart_gtfs.dim_schedule_feeds`.key = `mart_gtfs.dim_calendar`.feed_key
+         WHERE `mart_gtfs.dim_schedule_feeds`.base64_url IN {base_64_urls_str}
+             AND `mart_gtfs.dim_schedule_feeds`._valid_to >= '{max_lookback_date_iso}'
+             AND `mart_gtfs.dim_schedule_feeds`._valid_to <= '{target_date_iso}'
+             AND `mart_gtfs.dim_calendar`.{day_of_the_week} = 1
+             AND `mart_gtfs.dim_calendar`.start_date <= '{target_date_iso}'
+             AND `mart_gtfs.dim_calendar`.end_date >= '{target_date_iso}'
+         GROUP BY
+             `mart_gtfs.dim_schedule_feeds`.base64_url,
+             `mart_gtfs.dim_schedule_feeds`.key
+         LIMIT 1000
+     """
+     response = query_sql(query)
+     response_grouped = response.groupby("base64_url")
+     feed_info_by_url = response_grouped[["valid_feed_date", "feed_key"]].first()
+     # _valid_to is the date on which the feed becomes invalid, so the last date
+     # the feed *was* valid is the day before
+     feed_info_by_url["date_processed"] = feed_info_by_url["valid_feed_date"].dt.date - dt.timedelta(days=1)
+     return feed_info_by_url.drop("valid_feed_date", axis=1)
+
+ def merge_old_feeds(df_all_feeds: pd.DataFrame, df_undefined_feeds: pd.DataFrame, target_date: dt.date, max_lookback_timedelta: dt.timedelta) -> pd.DataFrame:
+     """
+     Merge feeds from df_all_feeds with old feeds found by calling get_old_feeds with df_undefined_feeds.base64_url
+
+     Parameters:
+     df_all_feeds: a DataFrame of feeds with feed_key, date, and base64_url columns; must include the base64_urls in df_undefined_feeds
+     df_undefined_feeds: a DataFrame of feeds that are not valid on target_date, for which an older feed should be searched for.
+         Must have base64_url as a column
+     target_date: the date on which the feeds should be valid
+     max_lookback_timedelta: a timedelta defining how far before target_date a feed may have been available
+
+     Returns:
+     A DataFrame identical to df_all_feeds except with the following columns changed or added:
+         feed_key: updated for the found feeds
+         date: updated for the found feeds
+         no_schedule_feed_found: True if a feed was present in df_undefined_feeds but no older feed was found, otherwise False
+         valid_date_other_than_service_date: True if an older feed was found, otherwise False
+     """
+     feed_search_result = get_old_feeds(
+         df_undefined_feeds["base64_url"],
+         target_date,
+         max_lookback_timedelta
+     )
+     feeds_merged = df_all_feeds.merge(
+         feed_search_result,
+         how="left",
+         left_on="base64_url",
+         right_index=True,
+         validate="many_to_one"
+     )
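+     # the merge suffixes the shared column as feed_key_x (original) and
+     # feed_key_y (older feed); prefer the older feed's key where one was found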
+     feeds_merged["feed_key"] = feeds_merged["feed_key_y"].fillna(feeds_merged["feed_key_x"])
+     feeds_merged["no_schedule_feed_found"] = (
+         (feeds_merged["base64_url"].isin(df_undefined_feeds["base64_url"]))
+         & (~feeds_merged["base64_url"].isin(feed_search_result.index))
+     ).fillna(False)
+     feeds_merged["date"] = feeds_merged["date_processed"].fillna(target_date)
+     feeds_merged["valid_date_other_than_service_date"] = feeds_merged["date"] != target_date
+
+     return feeds_merged.drop(
+         ["date_processed", "feed_key_x", "feed_key_y"], axis=1
+     )

if __name__ == '__main__':

    feeds_on_target = get_feeds_check_service()
    feeds_on_target = attach_transit_services(feeds_on_target)
    print(f'feeds on target date shape: {feeds_on_target.shape}')
-     report_undefined(feeds_on_target)
-     feeds_on_target.to_parquet(f'{conveyal_vars.GCS_PATH}feeds_{TARGET_DATE}.parquet')
+     undefined_feeds = get_undefined_feeds(feeds_on_target)
+     feeds_merged = merge_old_feeds(
+         feeds_on_target, undefined_feeds, dt.date.fromisoformat(TARGET_DATE), conveyal_vars.LOOKBACK_TIME
+     )
+     report_unavailable_feeds(feeds_merged, 'no_apparent_service.csv')
+     feeds_merged.to_parquet(f'{conveyal_vars.GCS_PATH}feeds_{TARGET_DATE}.parquet')