Skip to content

Feature: Add sqlite3 support with SQLModel #90

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2cd495f
Add clone, referring, paths, and traffic sqlmodels
astrochun Jun 4, 2022
a388234
Add db module to support sqlite #89
astrochun Jun 24, 2022
bd0e949
Add migrate_to_sqlite script
astrochun Jun 24, 2022
5c001ac
stats_plots: Load data from sqlite
astrochun Jun 26, 2022
ee9f7f6
gts_run_all_repos: Migrate new CSV to sqlite
astrochun Jun 26, 2022
cab1d1a
Adjust model fields to be consistent with GitHub UI terms
astrochun Jul 3, 2022
53f3e65
Change to editable pip install in python-package.yml
astrochun Jul 3, 2022
312c017
Add unit tests for db module
astrochun Jul 4, 2022
f57a351
db: Add query_path function to handle Paths data query
astrochun Jul 12, 2022
f507417
Add additional verbose messaging in scripts
astrochun Jul 13, 2022
29f8caa
Update gts_run_all_repos to include paths and referrer files
astrochun Jul 21, 2022
fa537ad
migrate_csv: Sort by repository name and date
astrochun Jul 24, 2022
a30619e
Merge remote-tracking branch 'origin/main' into feature/89_sqlite
astrochun Jul 31, 2022
e8613dc
Fix a few things for consistency
astrochun Jul 31, 2022
be1ffeb
Minor fix for poor gts column names
astrochun Jul 31, 2022
674cf57
Fix columns for top paths output
astrochun Jul 31, 2022
d927e37
Ensure that individual runs are added
astrochun Aug 1, 2022
3a85863
migrate_csv: Adjust sorting of dataframe
astrochun Aug 2, 2022
ec1abc6
Minor fix [ci skip]
astrochun Aug 2, 2022
a9fc8e7
Debug messaging [ci skip]
astrochun Aug 2, 2022
5d0f280
Handle existing new columns for merged paths CSV
astrochun Aug 2, 2022
6426597
migrate_csv: Fix empty path handling (treat as string and not NaN)
astrochun Aug 7, 2022
ea6a21f
Fix missing header for merged referrer file
astrochun Aug 7, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install github_stats_pages
run: |
pip install .[test]
pip install -e .[test]
- name: Test with pytest
run: |
echo "Username for unit tests : ${{ github.actor }}"
Expand Down
7 changes: 7 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest

from github_stats_pages import db


def pytest_addoption(parser):
parser.addoption("--username", action="store", default="GitHub username")
Expand All @@ -22,3 +24,8 @@ def token(request):
if name_value is None:
pytest.skip()
return name_value


@pytest.fixture(scope="session")
def test_engine():
    """Session-scoped SQLAlchemy engine backed by the test SQLite DB.

    Delegates to ``db.create_db_and_tables(test=True)``, which also
    creates all model tables, so tests can use the engine immediately.
    """
    return db.create_db_and_tables(test=True)
2 changes: 2 additions & 0 deletions entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ else
test="--test"
fi

migrate_to_sqlite

get_repo_list -u $1

gts_run_all_repos -u $1 -t $2 -c "$1".csv ${test}
Expand Down
5 changes: 5 additions & 0 deletions github_stats_pages/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
__version__ = "0.4.14"

# Map raw CSV column headers onto the model field names used in the DB
# (see github_stats_pages.db.migrate_csv, which applies this via
# DataFrame.rename before inserting records).
RENAME_MAPPING = {
    "count": "views",  # for paths
    "unique_visitors/cloners": "unique",  # for clones, traffic, referrer
    "uniques": "unique",  # for paths
}

# Supported statistics categories, matching the model/table names
STATS_TYPES = ["clone", "paths", "referrer", "traffic"]

Expand Down
168 changes: 168 additions & 0 deletions github_stats_pages/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from functools import partial
from pathlib import Path
from typing import List, Type, Union

import pandas as pd
from sqlalchemy.future import Engine
from sqlalchemy.exc import NoResultFound
from sqlmodel import SQLModel, Session, create_engine, select

from .models import Clone, Referrer, Traffic, Paths
from .logger import app_log as log
from . import RENAME_MAPPING, STATS_SORT_DATAFRAME

# Default on-disk location of the production SQLite database
SQLITE_FILE_NAME = Path("data/sqlite3.db")


def configure(test: bool = False, echo: bool = False) -> Engine:
    """Build the SQLAlchemy engine for the SQLite database.

    :param test: Use the test database location (``tests_data/``)
        instead of the default ``data/`` location
    :param echo: Forwarded to ``create_engine``; log all SQL statements
    :return: Configured SQLAlchemy ``Engine``
    """
    db_path = SQLITE_FILE_NAME if not test else Path("tests_data/sqlite3.db")
    # Make sure the containing folder exists before SQLite opens the file
    if not db_path.parent.exists():  # pragma: no cover
        db_path.parent.mkdir()
    db_url = f"sqlite:///{db_path}"
    log.info(f"Configuring SQLite at: {db_url}")
    return create_engine(db_url, echo=echo)


def create_db_and_tables(test: bool = False, echo: bool = False):
    """Configure the engine and create any missing model tables.

    :param test: Use the test database location (see ``configure``)
    :param echo: Log all SQL statements
    :return: The configured SQLAlchemy engine
    """
    db_engine = configure(test=test, echo=echo)
    # create_all is a no-op for tables that already exist
    SQLModel.metadata.create_all(db_engine)
    return db_engine


def migrate_csv(
    filename: Path,
    model: Type[SQLModel],
    engine: Engine,
):
    """Migrate CSV over to SQLite.

    Loads ``filename`` into a dataframe, normalizes column names to the
    model field names, then appends only the rows that are not already
    present in the table (matched on repository name + date, plus
    path/site for the Paths/Referrer models).

    :param filename: CSV file to migrate
    :param model: SQLModel table class (Clone, Paths, Referrer, Traffic)
    :param engine: Engine connected to the target SQLite database
    """

    log.info(f"[yellow]Loading: {filename}")
    # na_filter=False keeps empty path cells as "" instead of NaN
    df = pd.read_csv(filename, na_filter=False)
    df.rename(columns=RENAME_MAPPING, inplace=True)
    log.info(f"Size of dataframe: {len(df)}")
    log.info(f"columns: {df.columns}")
    if "merge" not in filename.name:
        if model.__name__ == "Referrer":  # Add date since this isn't included
            # Daily referrer CSVs carry the date in the first 10
            # characters of the file name -- presumably YYYY-MM-DD; confirm
            file_date = filename.name[:10]
            df.insert(loc=0, column="date", value=file_date)

    if model.__name__ == "Paths":
        if "repository_name" not in df.columns:
            # Split the repository name out of the raw path and keep only
            # the trailing path portion (drops the leading /user/repo/)
            repository_names = [a.split("/")[2] for a in df["path"].values]
            df.insert(1, "repository_name", repository_names)
            simple_paths = [
                "/".join(a.split("/")[3:]) for a in df["path"].values
            ]
            df["path"] = simple_paths
        else:
            log.info(
                f"{filename} already updated with repository_name and path"
            )

    sort_columns = STATS_SORT_DATAFRAME[model.__name__.lower()]
    log.info(f"sort_columns: {sort_columns}")
    df.sort_values(by=sort_columns, inplace=True)

    # Look up each row in the existing table; a truthy result means the
    # record is already stored
    if model.__name__ == "Paths":
        func = partial(query_path, engine=engine, model=model)
        query_results = list(
            map(func, df["repository_name"], df["date"], df["path"])
        )
    elif model.__name__ == "Referrer":
        func = partial(query_referrer, engine=engine, model=model)
        query_results = list(
            map(func, df["repository_name"], df["date"], df["site"])
        )
    else:  # For Clone and Traffic
        func = partial(query, engine=engine, model=model)
        query_results = list(map(func, df["repository_name"], df["date"]))

    # Keep only rows whose lookup found no existing record
    new_df: pd.DataFrame = df.iloc[
        [idx for idx, item in enumerate(query_results) if not item]
    ]
    if new_df.empty:
        log.info("No new records!")
    else:
        log.info(f"New records found: {len(new_df)}")
        log.info("[bold yellow]Adding data")
        new_df.to_sql(
            model.__name__.lower(), engine, if_exists="append", index=False
        )
        if len(new_df) < len(df):  # pragma: no cover
            log.info("[orange]Some records exists in db")


def query(
    repository_name: str,
    date: str,
    engine: Engine,
    model: Union[Type[SQLModel], Clone, Referrer, Paths, Traffic],
) -> Union[SQLModel, Clone, Referrer, Paths, Traffic, None]:
    """Return the record matching ``repository_name`` and ``date``.

    :param repository_name: Repository to look up
    :param date: Date string to look up
    :param engine: Engine connected to the SQLite database
    :param model: SQLModel table class to query
    :return: The matching record, or ``None`` when not found
    """

    with Session(engine) as session:
        result = session.exec(
            select(model).where(
                model.repository_name == repository_name, model.date == date
            )
        )
        # one_or_none() returns None for zero rows and still raises
        # MultipleResultsFound for >1, matching the previous
        # try/except NoResultFound behavior without the boilerplate
        return result.one_or_none()


def query_all(
    engine: Engine,
    model: Union[Type[SQLModel], Clone, Referrer, Paths, Traffic],
) -> List[Union[SQLModel, Clone, Referrer, Paths, Traffic]]:
    """Retrieve an entire table.

    :param engine: Engine connected to the SQLite database
    :param model: SQLModel table class whose rows are returned
    :return: All records of the table as a list
    """

    with Session(engine) as session:
        return session.exec(select(model)).all()


def query_path(
    repository_name: str,
    date: str,
    path: str,
    engine: Engine,
    model: Union[Type[SQLModel], Paths],
) -> Union[SQLModel, Paths, None]:
    """Return the Paths record matching repository, date and path.

    :param repository_name: Repository to look up
    :param date: Date string to look up
    :param path: Page path within the repository
    :param engine: Engine connected to the SQLite database
    :param model: SQLModel table class (Paths)
    :return: The matching record, or ``None`` when not found
    """

    with Session(engine) as session:
        result = session.exec(
            select(model).where(
                model.repository_name == repository_name,
                model.date == date,
                model.path == path,
            )
        )
        # one_or_none() replaces the try/except NoResultFound pattern
        # with identical semantics (None on no match)
        return result.one_or_none()


def query_referrer(
    repository_name: str,
    date: str,
    site: str,
    engine: Engine,
    model: Union[Type[SQLModel], Referrer],
) -> Union[SQLModel, Referrer, None]:
    """Return the Referrer record matching repository, date and site.

    :param repository_name: Repository to look up
    :param date: Date string to look up
    :param site: Referring site
    :param engine: Engine connected to the SQLite database
    :param model: SQLModel table class (Referrer)
    :return: The matching record, or ``None`` when not found
    """

    with Session(engine) as session:
        result = session.exec(
            select(model).where(
                model.repository_name == repository_name,
                model.date == date,
                model.site == site,
            )
        )
        # one_or_none() replaces the try/except NoResultFound pattern
        # with identical semantics (None on no match)
        return result.one_or_none()
4 changes: 4 additions & 0 deletions github_stats_pages/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .clone import Clone # noqa: F401
from .referrer import Referrer # noqa: F401
from .paths import Paths # noqa: F401
from .traffic import Traffic # noqa: F401
11 changes: 11 additions & 0 deletions github_stats_pages/models/clone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import Optional

from sqlmodel import SQLModel, Field


class Clone(SQLModel, table=True):
    """Per-repository, per-date git clone statistics."""

    # Surrogate primary key; None lets the database autoincrement it
    id: Optional[int] = Field(default=None, primary_key=True)
    repository_name: str
    date: str  # stored as a plain string, not a date type
    clones: int  # total clone count
    unique: int  # unique cloners (CSV column renamed via RENAME_MAPPING)
13 changes: 13 additions & 0 deletions github_stats_pages/models/paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Optional

from sqlmodel import SQLModel, Field


class Paths(SQLModel, table=True):
    """Per-repository, per-date popular page-path statistics."""

    # Surrogate primary key; None lets the database autoincrement it
    id: Optional[int] = Field(default=None, primary_key=True)
    date: str  # stored as a plain string, not a date type
    # Optional because older CSVs lack it; migrate_csv derives it from
    # the raw path when missing
    repository_name: Optional[str]
    path: str  # page path with the leading /user/repo/ stripped
    title: str
    views: int  # renamed from "count" CSV column via RENAME_MAPPING
    unique: int  # renamed from "uniques" CSV column via RENAME_MAPPING
12 changes: 12 additions & 0 deletions github_stats_pages/models/referrer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import Optional

from sqlmodel import SQLModel, Field


class Referrer(SQLModel, table=True):
    """Per-repository referring-site statistics."""

    # Surrogate primary key; None lets the database autoincrement it
    id: Optional[int] = Field(default=None, primary_key=True)
    repository_name: str
    site: str  # referring site
    # Optional because daily referrer CSVs carry no date column;
    # migrate_csv injects it from the file-name prefix
    date: Optional[str]
    views: int
    unique: int  # unique visitors (CSV column renamed via RENAME_MAPPING)
11 changes: 11 additions & 0 deletions github_stats_pages/models/traffic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import Optional

from sqlmodel import SQLModel, Field


class Traffic(SQLModel, table=True):
    """Per-repository, per-date page-view traffic statistics."""

    # Surrogate primary key; None lets the database autoincrement it
    id: Optional[int] = Field(default=None, primary_key=True)
    repository_name: str
    date: str  # stored as a plain string, not a date type
    views: int  # total page views
    unique: int  # unique visitors (CSV column renamed via RENAME_MAPPING)
40 changes: 29 additions & 11 deletions github_stats_pages/stats_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@
import pandas as pd

from .logger import app_log as log
from . import db
from .models import Clone, Traffic

prefix = "merged"
stats_type = ["traffic", "clone"]
c_columns = ["repository_name", "date", "total", "unique"]
r_columns = ["repository_name", "date", "source", "total", "unique"]
t_columns = ["repository_name", "date", "views", "unique"]

TOOLTIPS = [
("index", "$index"),
Expand All @@ -27,21 +32,34 @@
main_p = Path(__file__).parent


def load_data(
    test: bool = False, engine: Optional[db.Engine] = None
) -> Dict[str, pd.DataFrame]:
    """
    Load stats from the SQLite database as dict of pandas DataFrame

    :param test: Use the test database (for CI testing)
    :param engine: Existing SQLAlchemy engine; one is created (and tables
        ensured) when not provided
    :return: Dict of pandas DataFrame keyed by stats type
    """

    if not engine:
        engine = db.create_db_and_tables(test=test)

    dict_df = {}

    for stats, m in zip(stats_type, [Traffic, Clone]):
        records = [i.dict() for i in db.query_all(engine, m)]
        if records:
            dict_df[stats] = pd.DataFrame.from_records(records, index="id")
        else:
            log.warning(f"[bold red]No data in {stats} table!")
            # Empty table: return an empty frame with the expected columns
            # so downstream plotting code still sees the right schema
            names = []
            if stats == "clone":
                names = c_columns
            elif stats == "traffic":
                names = t_columns
            elif stats == "referrer":
                names = r_columns
            dict_df[stats] = pd.DataFrame(columns=names)

    return dict_df

Expand All @@ -64,8 +82,8 @@ def get_date_range(df_list: List[pd.DataFrame]) -> Optional[Tuple[dt, dt]]:

if len(x_min) > 0:
return min(x_min) - td(days=1), max(x_max) + td(days=1)
else:
return None
else: # pragma: no cover
return


def date_subplots(
Expand Down Expand Up @@ -211,19 +229,18 @@ def user_readme(username: str, token: str = None) -> str:

def make_plots(
username: str,
data_dir: str,
out_dir: str,
csv_file: str,
symlink: bool = False,
token: str = "",
include_repos: str = "",
exclude_repos: str = "",
test: bool = False,
):
"""
Generate HTML pages containing Bokeh plots

:param username: GitHub username or organization
:param data_dir: Path to working folder. CSVs are under a 'data' sub-folder
:param out_dir: Location of outputted HTML
:param csv_file: CSV file containing user or organization repository list
:param symlink: Symbolic link styles assets instead of copy. Default: copy
Expand All @@ -232,6 +249,7 @@ def make_plots(
Ignore csv_file inputs. Comma separated for multiples
:param exclude_repos: Repositories to exclude from csv_file list.
Comma separated for more than one
:param test: For CI testing
"""

if include_repos and exclude_repos:
Expand All @@ -244,7 +262,7 @@ def make_plots(
(~repository_df["fork"]) & (~repository_df["archived"])
]

dict_df = load_data(data_dir)
dict_df = load_data(test=test)

# Add repo folder for all static repo pages
p_repos = Path(out_dir) / "repos"
Expand Down
Loading