"""A utility class that handles database operations related to covidcast.
See src/ddl/covidcast.sql for an explanation of each field.
"""
import threading
import json
from math import ceil
from multiprocessing import cpu_count
from queue import Queue, Empty
from typing import List

# third party
import mysql.connector

# first party
import delphi.operations.secrets as secrets
from delphi_utils import get_structured_logger
from delphi.epidata.common.covidcast_row import CovidcastRow


class DBLoadStateException(Exception):
    pass


class Database:
    """A collection of covidcast database operations."""

    DATABASE_NAME = 'covid'

    load_table = "epimetric_load"
    # if you want to deal with foreign key ids: use table
    # if you want to deal with source/signal names, geo type/values, etc: use view
    latest_table = "epimetric_latest"
    latest_view = latest_table + "_v"
    history_table = "epimetric_full"
    history_view = history_table + "_v"

    # TODO: consider using class variables like this for dimension table names too
    # TODO: also consider that for composite key tuples, like short_comp_key and long_comp_key as used in delete_batch()

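    # Typical lifecycle (a sketch only; `rows` below stands in for a List[CovidcastRow] built by the caller):
    #   db = Database()
    #   db.connect()
    #   try:
    #       db.insert_or_update_batch(rows)
    #   finally:
    #       db.disconnect(True)  # True commits on close; False discards uncommitted changes
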
    def connect(self, connector_impl=mysql.connector):
        """Establish a connection to the database."""
        u, p = secrets.db.epi
        self._connector_impl = connector_impl
        self._connection = self._connector_impl.connect(
            host=secrets.db.host,
            user=u,
            password=p,
            database=Database.DATABASE_NAME)
        self._cursor = self._connection.cursor()

    def commit(self):
        self._connection.commit()

    def rollback(self):
        self._connection.rollback()

    def disconnect(self, commit):
        """Close the database connection.

        commit: if true, commit changes, otherwise rollback
        """
        self._cursor.close()
        if commit:
            self._connection.commit()
        self._connection.close()

    def count_all_load_rows(self):
        self._cursor.execute(f'SELECT count(1) FROM `{self.load_table}`')
        for (num,) in self._cursor:
            return num

    def _reset_load_table_ai_counter(self):
        """Corrects the AUTO_INCREMENT counter in the load table.

        To be used in emergencies only, if the load table was accidentally TRUNCATEd.
        This ensures any `epimetric_id`s generated by the load table will not collide with the history or latest tables.
        This is also destructive to any data in the load table.
        """
        self._cursor.execute('DELETE FROM epimetric_load')
        # NOTE: 'ones' are used as filler here for the (required) NOT NULL columns.
        self._cursor.execute("""
            INSERT INTO epimetric_load
                (epimetric_id,
                 source, `signal`, geo_type, geo_value, time_type, time_value, issue, `lag`, value_updated_timestamp)
            VALUES
                ((SELECT 1+MAX(epimetric_id) FROM epimetric_full),
                 '1', '1', '1', '1', '1', 1, 1, 1, 1);""")
        self._cursor.execute('DELETE FROM epimetric_load')

    def do_analyze(self):
        """performs and stores key distribution analyses, used for join order and index selection"""
        # TODO: consider expanding this to update columns' histograms
        # https://dev.mysql.com/doc/refman/8.0/en/analyze-table.html#analyze-table-histogram-statistics-analysis
        self._cursor.execute(
            f'''ANALYZE TABLE
                signal_dim, geo_dim,
                {self.load_table}, {self.history_table}, {self.latest_table}''')
        output = [self._cursor.column_names] + self._cursor.fetchall()
        get_structured_logger('do_analyze').info("ANALYZE results", results=str(output))

    def insert_or_update_bulk(self, cc_rows):
        return self.insert_or_update_batch(cc_rows)

    def insert_or_update_batch(self, cc_rows: List[CovidcastRow], batch_size=2**20, commit_partial=False, suppress_jobs=False):
        """
        Insert new rows into the load table and dispatch into dimension and fact tables.
        """

        if 0 != self.count_all_load_rows():
            err_msg = "Non-zero count in the load table!!! This indicates a previous acquisition run may have failed, another acquisition is in progress, or this process does not otherwise have exclusive access to the db!"
            get_structured_logger("insert_or_update_batch").fatal(err_msg)
            raise DBLoadStateException(err_msg)

        # NOTE: `value_updated_timestamp` is hardcoded to "NOW" (which is appropriate) and
        #       `is_latest_issue` is hardcoded to 1 (which is temporary and addressed later in this method)
        insert_into_loader_sql = f'''
            INSERT INTO `{self.load_table}`
                (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`,
                `value_updated_timestamp`, `value`, `stderr`, `sample_size`, `issue`, `lag`,
                `is_latest_issue`, `missing_value`, `missing_stderr`, `missing_sample_size`)
            VALUES
                (%s, %s, %s, %s, %s, %s,
                UNIX_TIMESTAMP(NOW()), %s, %s, %s, %s, %s,
                1, %s, %s, %s)
        '''

        # all load table entries are already marked "is_latest_issue".
        # if an entry in the load table is NOT in the latest table, it is clearly now the latest value for that key (so we do nothing (thanks to INNER join)).
        # if an entry *IS* in both load and latest tables, but latest table issue is newer, unmark is_latest_issue in load.
        fix_is_latest_issue_sql = f'''
            UPDATE
                `{self.load_table}` JOIN `{self.latest_view}`
                    USING (`source`, `signal`, `geo_type`, `geo_value`, `time_type`, `time_value`)
            SET `{self.load_table}`.`is_latest_issue`=0
            WHERE `{self.load_table}`.`issue` < `{self.latest_view}`.`issue`
        '''

        # TODO: consider handling cc_rows as a generator instead of a list
        try:
            num_rows = len(cc_rows)
            total = 0
            if not batch_size:
                batch_size = num_rows
            num_batches = ceil(num_rows/batch_size)

            for batch_num in range(num_batches):
                start = batch_num * batch_size
                end = min(num_rows, start + batch_size)

                args = [(
                    row.source,
                    row.signal,
                    row.time_type,
                    row.geo_type,
                    row.time_value,
                    row.geo_value,
                    row.value,
                    row.stderr,
                    row.sample_size,
                    row.issue,
                    row.lag,
                    row.missing_value,
                    row.missing_stderr,
                    row.missing_sample_size
                ) for row in cc_rows[start:end]]

                self._cursor.executemany(insert_into_loader_sql, args)
                modified_row_count = self._cursor.rowcount
                self._cursor.execute(fix_is_latest_issue_sql)
                if not suppress_jobs:
                    self.run_dbjobs()  # TODO: incorporate the logic of dbjobs() into this method [once calls to dbjobs() are no longer needed for migrations]

                if modified_row_count is None or modified_row_count == -1:
                    # the SQL connector does not support returning number of rows affected (see PEP 249)
                    total = None
                else:
                    total += modified_row_count
                if commit_partial:
                    self._connection.commit()
        except Exception as e:
            # rollback is handled in csv_to_database; if you're calling this yourself, handle your own rollback
            raise e
        return total

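    # NOTE: when called with `suppress_jobs=True`, the inserted rows remain in the load table
    #       after this method returns; the caller is then responsible for invoking run_dbjobs()
    #       (and committing) itself.
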
    def run_dbjobs(self):

        # we do this LEFT JOIN trick because mysql cant do set difference (aka EXCEPT or MINUS)
        # (as in " select distinct source, signal from signal_dim minus select distinct source, signal from epimetric_load ")
        signal_dim_add_new_load = f'''
            INSERT INTO signal_dim (`source`, `signal`)
                SELECT DISTINCT sl.source, sl.signal
                    FROM {self.load_table} AS sl LEFT JOIN signal_dim AS sd
                        USING (`source`, `signal`)
                    WHERE sd.source IS NULL
        '''

        # again, same trick to get around lack of EXCEPT/MINUS
        geo_dim_add_new_load = f'''
            INSERT INTO geo_dim (`geo_type`, `geo_value`)
                SELECT DISTINCT sl.geo_type, sl.geo_value
                    FROM {self.load_table} AS sl LEFT JOIN geo_dim AS gd
                        USING (`geo_type`, `geo_value`)
                    WHERE gd.geo_type IS NULL
        '''

        epimetric_full_load = f'''
            INSERT INTO {self.history_table}
                (epimetric_id, signal_key_id, geo_key_id, issue, data_as_of_dt,
                time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp,
                computation_as_of_dt, missing_value, missing_stderr, missing_sample_size)
            SELECT
                epimetric_id, sd.signal_key_id, gd.geo_key_id, issue, data_as_of_dt,
                time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp,
                computation_as_of_dt, missing_value, missing_stderr, missing_sample_size
            FROM `{self.load_table}` sl
                INNER JOIN signal_dim sd USING (source, `signal`)
                INNER JOIN geo_dim gd USING (geo_type, geo_value)
            ON DUPLICATE KEY UPDATE
                `epimetric_id` = sl.`epimetric_id`,
                `value_updated_timestamp` = sl.`value_updated_timestamp`,
                `value` = sl.`value`,
                `stderr` = sl.`stderr`,
                `sample_size` = sl.`sample_size`,
                `lag` = sl.`lag`,
                `missing_value` = sl.`missing_value`,
                `missing_stderr` = sl.`missing_stderr`,
                `missing_sample_size` = sl.`missing_sample_size`
        '''

        epimetric_latest_load = f'''
            INSERT INTO {self.latest_table}
                (epimetric_id, signal_key_id, geo_key_id, issue, data_as_of_dt,
                time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp,
                computation_as_of_dt, missing_value, missing_stderr, missing_sample_size)
            SELECT
                epimetric_id, sd.signal_key_id, gd.geo_key_id, issue, data_as_of_dt,
                time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp,
                computation_as_of_dt, missing_value, missing_stderr, missing_sample_size
            FROM `{self.load_table}` sl
                INNER JOIN signal_dim sd USING (source, `signal`)
                INNER JOIN geo_dim gd USING (geo_type, geo_value)
            WHERE is_latest_issue = 1
            ON DUPLICATE KEY UPDATE
                `epimetric_id` = sl.`epimetric_id`,
                `value_updated_timestamp` = sl.`value_updated_timestamp`,
                `value` = sl.`value`,
                `stderr` = sl.`stderr`,
                `sample_size` = sl.`sample_size`,
                `issue` = sl.`issue`,
                `lag` = sl.`lag`,
                `missing_value` = sl.`missing_value`,
                `missing_stderr` = sl.`missing_stderr`,
                `missing_sample_size` = sl.`missing_sample_size`
        '''

        # NOTE: DO NOT `TRUNCATE` THIS TABLE! doing so will ruin the AUTO_INCREMENT counter that the history and latest tables depend on...
        epimetric_load_delete_processed = f'''
            DELETE FROM `{self.load_table}`
        '''

        logger = get_structured_logger("run_dbjobs")
        import time
        time_q = [time.time()]

        try:
            self._cursor.execute(signal_dim_add_new_load)
            time_q.append(time.time())
            logger.debug('signal_dim_add_new_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2])

            self._cursor.execute(geo_dim_add_new_load)
            time_q.append(time.time())
            logger.debug('geo_dim_add_new_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2])

            self._cursor.execute(epimetric_full_load)
            time_q.append(time.time())
            logger.debug('epimetric_full_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2])

            self._cursor.execute(epimetric_latest_load)
            time_q.append(time.time())
            logger.debug('epimetric_latest_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2])

            self._cursor.execute(epimetric_load_delete_processed)
            time_q.append(time.time())
            logger.debug('epimetric_load_delete_processed', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2])
        except Exception as e:
            raise e

        return self

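    # NOTE: run_dbjobs() issues its statements on the shared cursor and does not commit;
    #       the caller is responsible for committing (or rolling back) the surrounding transaction.
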
    def delete_batch(self, cc_deletions):
        """
        Remove rows specified by a csv file or list of tuples.

        If cc_deletions is a filename, the file should include a header row and use the following field order:
        - geo_id
        - value (ignored)
        - stderr (ignored)
        - sample_size (ignored)
        - issue (YYYYMMDD format)
        - time_value (YYYYMMDD format)
        - geo_type
        - signal
        - source

        If cc_deletions is a list of tuples, the tuples should use the following field order (=same as above, plus time_type):
        - geo_id
        - value (ignored)
        - stderr (ignored)
        - sample_size (ignored)
        - issue (YYYYMMDD format)
        - time_value (YYYYMMDD format)
        - geo_type
        - signal
        - source
        - time_type
        """
        tmp_table_name = "tmp_delete_table"
        # composite keys:
        short_comp_key = "`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`"
        long_comp_key = short_comp_key + ", `issue`"

        create_tmp_table_sql = f'''
            CREATE TABLE {tmp_table_name} LIKE {self.load_table};
        '''

        amend_tmp_table_sql = f'''
            ALTER TABLE {tmp_table_name}
                ADD COLUMN delete_history_id BIGINT UNSIGNED,
                ADD COLUMN delete_latest_id BIGINT UNSIGNED,
                ADD COLUMN update_latest BINARY(1) DEFAULT 0;
        '''

        load_tmp_table_infile_sql = f'''
            LOAD DATA INFILE "{cc_deletions}"
            INTO TABLE {tmp_table_name}
            FIELDS TERMINATED BY ","
            IGNORE 1 LINES
            (`geo_value`, `value`, `stderr`, `sample_size`, `issue`, `time_value`, `geo_type`, `signal`, `source`)
            SET time_type="day";
        '''

        load_tmp_table_insert_sql = f'''
            INSERT INTO {tmp_table_name}
                (`geo_value`, `value`, `stderr`, `sample_size`, `issue`, `time_value`, `geo_type`, `signal`, `source`, `time_type`,
                `value_updated_timestamp`, `lag`, `is_latest_issue`)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                0, 0, 0)
        '''

        add_history_id_sql = f'''
            UPDATE {tmp_table_name} d INNER JOIN {self.history_view} h USING ({long_comp_key})
            SET d.delete_history_id=h.epimetric_id;
        '''

        # if a row we are deleting also appears in the 'latest' table (with a matching 'issue')...
        mark_for_update_latest_sql = f'''
            UPDATE {tmp_table_name} d INNER JOIN {self.latest_view} ell USING ({long_comp_key})
            SET d.update_latest=1, d.delete_latest_id=ell.epimetric_id;
        '''

        delete_history_sql = f'''
            DELETE h FROM {tmp_table_name} d INNER JOIN {self.history_table} h ON d.delete_history_id=h.epimetric_id;
        '''

        # ...remove it from 'latest'...
        delete_latest_sql = f'''
            DELETE ell FROM {tmp_table_name} d INNER JOIN {self.latest_table} ell ON d.delete_latest_id=ell.epimetric_id;
        '''

        # ...and re-write that record with its next-latest issue (from 'history') instead.
        # NOTE: this must be executed *AFTER* `delete_history_sql` to ensure we get the correct `issue`
        #       AND also after `delete_latest_sql` so that we dont get a key collision on insert.
        update_latest_sql = f'''
            INSERT INTO {self.latest_table}
                (epimetric_id,
                signal_key_id, geo_key_id, time_type, time_value, issue,
                value, stderr, sample_size, `lag`, value_updated_timestamp,
                missing_value, missing_stderr, missing_sample_size)
            SELECT
                h.epimetric_id,
                h.signal_key_id, h.geo_key_id, h.time_type, h.time_value, h.issue,
                h.value, h.stderr, h.sample_size, h.`lag`, h.value_updated_timestamp,
                h.missing_value, h.missing_stderr, h.missing_sample_size
            FROM {self.history_view} h JOIN (
                SELECT {short_comp_key}, MAX(hh.issue) AS issue
                    FROM {self.history_view} hh JOIN {tmp_table_name} dd USING ({short_comp_key})
                    WHERE dd.update_latest=1 GROUP BY {short_comp_key}
            ) d USING ({long_comp_key});
        '''

        drop_tmp_table_sql = f'DROP TABLE IF EXISTS {tmp_table_name}'

        total = None
        try:
            self._cursor.execute(drop_tmp_table_sql)
            self._cursor.execute(create_tmp_table_sql)
            self._cursor.execute(amend_tmp_table_sql)

            if isinstance(cc_deletions, str):
                self._cursor.execute(load_tmp_table_infile_sql)
            elif isinstance(cc_deletions, list):
                def split_list(lst, n):
                    for i in range(0, len(lst), n):
                        yield lst[i:(i+n)]
                for deletions_batch in split_list(cc_deletions, 100000):
                    self._cursor.executemany(load_tmp_table_insert_sql, deletions_batch)
                    print(f"load_tmp_table_insert_sql:{self._cursor.rowcount}")
            else:
                raise Exception(f"Bad deletions argument: need a filename or a list of tuples; got a {type(cc_deletions)}")

            self._cursor.execute(add_history_id_sql)
            print(f"add_history_id_sql:{self._cursor.rowcount}")
            self._cursor.execute(mark_for_update_latest_sql)
            print(f"mark_for_update_latest_sql:{self._cursor.rowcount}")
            self._cursor.execute(delete_history_sql)
            print(f"delete_history_sql:{self._cursor.rowcount}")
            total = self._cursor.rowcount
            # TODO: consider reporting rows removed and/or replaced in latest table as well
            self._cursor.execute(delete_latest_sql)
            print(f"delete_latest_sql:{self._cursor.rowcount}")
            self._cursor.execute(update_latest_sql)
            print(f"update_latest_sql:{self._cursor.rowcount}")
            self._connection.commit()

            if total == -1:
                # the SQL connector does not support returning number of rows affected (see PEP 249)
                total = None
        except Exception as e:
            raise e
        finally:
            self._cursor.execute(drop_tmp_table_sql)
        return total

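    # Usage sketch (all values below are illustrative placeholders; field order matches the docstring above):
    #   deletions = [
    #       # geo_id, value, stderr, sample_size, issue, time_value, geo_type, signal, source, time_type
    #       ("pa", 0, 0, 0, 20210101, 20201225, "state", "some_signal", "some_source", "day"),
    #   ]
    #   db.delete_batch(deletions)  # commits internally; returns the number of rows removed from the history table
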
    def compute_covidcast_meta(self, table_name=None, n_threads=None):
        """Compute and return metadata on all COVIDcast signals."""
        logger = get_structured_logger("compute_covidcast_meta")

        if table_name is None:
            table_name = self.latest_view

        if n_threads is None:
            logger.info("n_threads unspecified, automatically choosing based on number of detected cores...")
            n_threads = max(1, cpu_count()*9//10)  # aka number of concurrent db connections, which [sh|c]ould be ~<= 90% of the #cores available to SQL server
            # NOTE: this may present a small problem if this job runs on different hardware than the db,
            #       which is why this value can be overridden by optional argument.
        logger.info(f"using {n_threads} workers")

        srcsigs = Queue()  # multi-consumer threadsafe!

        sql = f'SELECT `source`, `signal` FROM `{table_name}` GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;'
        self._cursor.execute(sql)
        for source, signal in self._cursor:
            srcsigs.put((source, signal))

        inner_sql = f'''
            SELECT
                `source` AS `data_source`,
                `signal`,
                `time_type`,
                `geo_type`,
                MIN(`time_value`) AS `min_time`,
                MAX(`time_value`) AS `max_time`,
                COUNT(DISTINCT `geo_value`) AS `num_locations`,
                MIN(`value`) AS `min_value`,
                MAX(`value`) AS `max_value`,
                ROUND(AVG(`value`),7) AS `mean_value`,
                ROUND(STD(`value`),7) AS `stdev_value`,
                MAX(`value_updated_timestamp`) AS `last_update`,
                MAX(`issue`) as `max_issue`,
                MIN(`lag`) as `min_lag`,
                MAX(`lag`) as `max_lag`
            FROM
                `{table_name}`
            WHERE
                `source` = %s AND
                `signal` = %s
            GROUP BY
                `time_type`,
                `geo_type`
            ORDER BY
                `time_type` ASC,
                `geo_type` ASC
        '''

        meta = []
        meta_lock = threading.Lock()

        def worker():
            name = threading.current_thread().name
            logger.info("starting thread", thread=name)
            # set up new db connection for thread
            worker_dbc = Database()
            worker_dbc.connect(connector_impl=self._connector_impl)
            w_cursor = worker_dbc._cursor
            try:
                while True:
                    (source, signal) = srcsigs.get_nowait()  # this will throw the Empty caught below
                    logger.info("starting pair", thread=name, pair=f"({source}, {signal})")
                    w_cursor.execute(inner_sql, (source, signal))
                    with meta_lock:
                        meta.extend(list(
                            dict(zip(w_cursor.column_names, x)) for x in w_cursor
                        ))
                    srcsigs.task_done()
            except Empty:
                logger.info("no jobs left, thread terminating", thread=name)
            finally:
                worker_dbc.disconnect(False)  # cleanup

        threads = []
        for n in range(n_threads):
            t = threading.Thread(target=worker, name='MetacacheThread-'+str(n))
            t.start()
            threads.append(t)

        srcsigs.join()
        logger.info("jobs complete")
        for t in threads:
            t.join()
        logger.info("all threads terminated")

        meta = sorted(meta, key=lambda x: (x['data_source'], x['signal'], x['time_type'], x['geo_type']))

        return meta

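    # Sketch of a typical metadata cache refresh using the methods above and below:
    #   meta = db.compute_covidcast_meta()
    #   db.update_covidcast_meta_cache(meta)
    #   db.commit()
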
    def update_covidcast_meta_cache(self, metadata):
        """Updates the `covidcast_meta_cache` table."""
        sql = '''
            UPDATE
                `covidcast_meta_cache`
            SET
                `timestamp` = UNIX_TIMESTAMP(NOW()),
                `epidata` = %s
        '''
        epidata_json = json.dumps(metadata)
        self._cursor.execute(sql, (epidata_json,))

    def retrieve_covidcast_meta_cache(self):
        """Useful for viewing cache entries (was used in debugging)"""
        sql = '''
            SELECT `epidata`
            FROM `covidcast_meta_cache`
            ORDER BY `timestamp` DESC
            LIMIT 1;
        '''
        self._cursor.execute(sql)
        cache_json = self._cursor.fetchone()[0]
        cache = json.loads(cache_json)
        cache_hash = {}
        for entry in cache:
            cache_hash[(entry['data_source'], entry['signal'], entry['time_type'], entry['geo_type'])] = entry
        return cache_hash

    def compute_coverage_crossref(self):
        """Compute coverage_crossref table, for looking up available signals per geo or vice versa."""
        logger = get_structured_logger("compute_coverage_crossref")

        coverage_crossref_delete_sql = '''
            DELETE FROM coverage_crossref;
        '''

        coverage_crossref_update_sql = '''
            INSERT INTO coverage_crossref (signal_key_id, geo_key_id, min_time_value, max_time_value)
                SELECT
                    signal_key_id,
                    geo_key_id,
                    MIN(time_value) AS min_time_value,
                    MAX(time_value) AS max_time_value
                FROM covid.epimetric_latest
                GROUP BY signal_key_id, geo_key_id;
        '''

        self._connection.start_transaction()
        self._cursor.execute(coverage_crossref_delete_sql)
        logger.info("coverage_crossref_delete", rows=self._cursor.rowcount)
        self._cursor.execute(coverage_crossref_update_sql)
        logger.info("coverage_crossref_update", rows=self._cursor.rowcount)
        self.commit()
        logger.info("coverage_crossref committed")

        return self._cursor.rowcount