
Commit fe296fd

Merge pull request #129 from Roestlab/ci: Continuous Integration Tests
2 parents: 64c6a60 + 81f78bf

15 files changed: +278 -119 lines

.github/workflows/ci.yml

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+name: continuous-integration
+
+on: [push]
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        # Requirements file generated with python=3.11
+        python-version: ["3.11"]
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt # test with requirements file so can easily bump with dependabot
+        pip install .
+
+    - name: Compile cython module
+      run: python setup.py build_ext --inplace
+
+    - name: Test
+      run: |
+        python -m pytest tests/

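The workflow can be reproduced locally under the same assumptions (a Python 3.11 environment at the repository root): upgrade pip, install from requirements.txt, install the package itself, build the Cython extension in place with python setup.py build_ext --inplace, then run python -m pytest tests/.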
.github/workflows/dependabot.yml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/" # Location of your pyproject.toml or requirements.txt
+    schedule:
+      interval: "weekly" # Checks for updates every week
+    commit-message:
+      prefix: "deps" # Prefix for pull request titles
+    open-pull-requests-limit: 5 # Limit the number of open PRs at a time

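This configuration works in tandem with the CI job above: dependencies are installed from requirements.txt precisely so that Dependabot's weekly, deps-prefixed pull requests bump those pins and get validated by the same test matrix.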
.gitignore

Lines changed: 1 addition & 0 deletions
@@ -36,3 +36,4 @@ nosetests.xml
 
 # vim
 *.sw[opqrs]
+*~

pyproject.toml

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+[build-system]
+requires = ["setuptools", "wheel", "numpy", "cython"] # Dependencies needed to build the package
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pyprophet"
+version = "2.2.9"
+description = "PyProphet: Semi-supervised learning and scoring of OpenSWATH results."
+readme = { file = "README.md", content-type = "text/markdown" }
+license = { text = "BSD" }
+authors = [{ name = "The PyProphet Developers", email = "[email protected]" }]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: BSD License",
+    "Operating System :: OS Independent",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+    "Topic :: Scientific/Engineering :: Chemistry"
+]
+keywords = ["bioinformatics", "openSWATH", "mass spectrometry"]
+
+# Dependencies required for runtime
+dependencies = [
+    "Click",
+    "duckdb",
+    "duckdb-extensions",
+    "duckdb-extension-sqlite-scanner",
+    "numpy >= 2.0",
+    "scipy",
+    "pandas >= 0.17",
+    "cython",
+    "numexpr >= 2.10.1",
+    "scikit-learn >= 0.17",
+    "xgboost",
+    "hyperopt",
+    "statsmodels >= 0.8.0",
+    "matplotlib",
+    "tabulate",
+    "pyarrow",
+    "pypdf"
+]
+
+# Optional dependencies
+[project.optional-dependencies]
+testing = ["pytest", "pytest-regtest"]
+
+# Define console entry points
+[project.scripts]
+pyprophet = "pyprophet.main:cli"
+
+[tool.setuptools]
+packages = { find = { exclude = ["ez_setup", "examples", "tests"] } }
+include-package-data = true
+zip-safe = false

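With the optional-dependencies table in place, the test tooling is installable through pip's standard extras syntax, e.g. pip install ".[testing]", which is a convenient way to mirror the CI environment locally.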
pyprophet/classifiers.py

Lines changed: 7 additions & 6 deletions
@@ -110,7 +110,7 @@ def objective(params):
 
 clf = xgb.XGBClassifier(random_state=42, verbosity=0, objective='binary:logitraw', eval_metric='auc', **params)
 
-score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=np.random.RandomState(42))).mean()
+score = cross_val_score(clf, X, y, scoring='roc_auc', n_jobs=self.threads, cv=KFold(n_splits=3, shuffle=True, random_state=42)).mean()
 # click.echo("Info: AUC: {:.3f} hyperparameters: {}".format(score, params))
 return score
 
@@ -129,7 +129,8 @@ def objective(params):
 xgb_params_complexity = self.xgb_params_tuned
 xgb_params_complexity.update({k: self.xgb_params_space[k] for k in ('max_depth', 'min_child_weight')})
 
-best_complexity = fmin(fn=objective, space=xgb_params_complexity, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))
+rng = np.random.default_rng(42)
+best_complexity = fmin(fn=objective, space=xgb_params_complexity, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng)
 best_complexity['max_depth'] = int(best_complexity['max_depth'])
 best_complexity['min_child_weight'] = int(best_complexity['min_child_weight'])
 
@@ -139,31 +140,31 @@ def objective(params):
 xgb_params_gamma = self.xgb_params_tuned
 xgb_params_gamma['gamma'] = self.xgb_params_space['gamma']
 
-best_gamma = fmin(fn=objective, space=xgb_params_gamma, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))
+best_gamma = fmin(fn=objective, space=xgb_params_gamma, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng)
 
 self.xgb_params_tuned.update(best_gamma)
 
 # Tune subsampling hyperparameters
 xgb_params_subsampling = self.xgb_params_tuned
 xgb_params_subsampling.update({k: self.xgb_params_space[k] for k in ('subsample', 'colsample_bytree', 'colsample_bylevel', 'colsample_bynode')})
 
-best_subsampling = fmin(fn=objective, space=xgb_params_subsampling, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))
+best_subsampling = fmin(fn=objective, space=xgb_params_subsampling, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng)
 
 self.xgb_params_tuned.update(best_subsampling)
 
 # Tune regularization hyperparameters
 xgb_params_regularization = self.xgb_params_tuned
 xgb_params_regularization.update({k: self.xgb_params_space[k] for k in ('lambda', 'alpha')})
 
-best_regularization = fmin(fn=objective, space=xgb_params_regularization, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))
+best_regularization = fmin(fn=objective, space=xgb_params_regularization, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng)
 
 self.xgb_params_tuned.update(best_regularization)
 
 # Tune learning rate
 xgb_params_learning = self.xgb_params_tuned
 xgb_params_learning['eta'] = self.xgb_params_space['eta']
 
-best_learning = fmin(fn=objective, space=xgb_params_learning, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=np.random.RandomState(42))
+best_learning = fmin(fn=objective, space=xgb_params_learning, algo=tpe.suggest, max_evals=self.xgb_hyperparams['autotune_num_rounds'], rstate=rng)
 
 self.xgb_params_tuned.update(best_learning)
 click.echo("Info: Optimal hyperparameters: {}".format(self.xgb_params_tuned))

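Two seeding changes are bundled here: scikit-learn's KFold takes a plain integer for random_state, and the rstate passed to hyperopt's fmin becomes a NumPy Generator created once and reused across all tuning stages (recent hyperopt releases expect np.random.default_rng-style Generators rather than the legacy np.random.RandomState). A minimal sketch of the new pattern, with a toy quadratic objective standing in for the cross-validated AUC:

    import numpy as np
    from hyperopt import fmin, tpe, hp

    space = {"x": hp.uniform("x", -1.0, 1.0)}
    rng = np.random.default_rng(42)  # one Generator shared across tuning stages, as in the diff

    # fmin minimizes the returned loss; x**2 is a toy stand-in objective
    best = fmin(fn=lambda params: params["x"] ** 2, space=space,
                algo=tpe.suggest, max_evals=10, rstate=rng)
    print(best)  # {'x': ...} near 0, reproducible across runs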
pyprophet/data_handling.py

Lines changed: 17 additions & 1 deletion
@@ -5,6 +5,7 @@
 import sys
 import os
 import multiprocessing
+from .stats import mean_and_std_dev
 
 from .optimized import find_top_ranked, rank
 
@@ -336,6 +337,21 @@ def get_top_target_peaks(self):
 def get_feature_matrix(self, use_main_score):
     min_col = 5 if use_main_score else 6
     return self.df.iloc[:, min_col:-1].values
+
+def normalize_score_by_decoys(self, score_col_name):
+    '''
+    normalize the decoy scores to mean 0 and std 1, scale the targets accordingly
+    Args:
+        score_col_name: str, the name of the score column
+    '''
+    td_scores = self.get_top_decoy_peaks()[score_col_name]
+    mu, nu = mean_and_std_dev(td_scores)
+
+    if nu == 0:
+        raise Exception("Warning: Standard deviation of decoy scores is zero. Cannot normalize scores.")
+
+    self.df.loc[:, score_col_name] = (self.df[score_col_name] - mu) / nu
 
 def filter_(self, idx):
     return Experiment(self.df[idx])
 
@@ -344,7 +360,7 @@ def filter_(self, idx):
 def add_peak_group_rank(self):
     ids = self.df.tg_num_id.values
     scores = self.df.d_score.values
-    peak_group_ranks = rank(ids, scores)
+    peak_group_ranks = rank(ids, scores.astype(np.float32, copy=False))
     self.df["peak_group_rank"] = peak_group_ranks
 
 @profile

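The new normalize_score_by_decoys method anchors every score to the decoy (null) distribution. A standalone sketch of the same z-scaling idea, using hypothetical NumPy arrays in place of the Experiment dataframe:

    import numpy as np

    def normalize_by_decoys(scores, decoy_scores):
        # shift and scale so the decoys have mean 0 and std 1;
        # target scores are transformed with the same mu and nu
        mu, nu = decoy_scores.mean(), decoy_scores.std()
        if nu == 0:
            raise ValueError("decoy score standard deviation is zero; cannot normalize")
        return (scores - mu) / nu

    scores = np.array([0.1, 2.5, -0.3, 1.8])
    decoys = np.array([0.0, 0.2, -0.2])
    print(normalize_by_decoys(scores, decoys))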
pyprophet/export_parquet.py

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@ def export_to_parquet(infile, outfile, transitionLevel, onlyFeatures=False):
 
 # transition level
 if transitionLevel:
-    columns['FEATURE_TRANSITION'] = ['AREA_INTENSITY', 'TOTAL_AREA_INTENSITY', 'APEX_INTENSITY', 'TOTAL_MI'] + getVarColumnNames(condb, 'FEATURE_TRANSITION')
+    columns['FEATURE_TRANSITION'] = ['AREA_INTENSITY', 'TOTAL_AREA_INTENSITY', 'APEX_INTENSITY', 'TOTAL_MI'] + getVarColumnNames(con, 'FEATURE_TRANSITION')
     columns['TRANSITION'] = ['TRAML_ID', 'PRODUCT_MZ', 'CHARGE', 'TYPE', 'ORDINAL', 'DETECTING', 'IDENTIFYING', 'QUANTIFYING', 'LIBRARY_INTENSITY']
     columns['TRANSITION_PRECURSOR_MAPPING'] = ['TRANSITION_ID']

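The one-character rename fixes what appears to be a NameError: condb is not defined at this point in export_to_parquet, whereas con is the open database connection used by the surrounding code.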
pyprophet/levels_contexts.py

Lines changed: 8 additions & 3 deletions
@@ -33,7 +33,12 @@ def statistics_report(data, outfile, context, analyte, parametric, pfdr, pi0_lam
 outfile = outfile + "_" + str(data['run_id'].unique()[0])
 
 # export PDF report
-save_report(outfile + "_" + context + "_" + analyte + ".pdf", outfile + ": " + context + " " + analyte + "-level error-rate control", data[data.decoy==1]["score"], data[data.decoy==0]["score"], stat_table["cutoff"], stat_table["svalue"], stat_table["qvalue"], data[data.decoy==0]["p_value"], pi0, color_palette)
+save_report(outfile + "_" + context + "_" + analyte + ".pdf",
+            outfile + ": " + context + " " + analyte + "-level error-rate control",
+            data[data.decoy==1]["score"].values, data[data.decoy==0]["score"].values, stat_table["cutoff"].values,
+            stat_table["svalue"].values, stat_table["qvalue"].values, data[data.decoy==0]["p_value"].values,
+            pi0,
+            color_palette)
 
 return(data)
 
@@ -184,7 +189,7 @@ def infer_proteins(infile, outfile, context, parametric, pfdr, pi0_lambda, pi0_m
 con.close()
 
 if context == 'run-specific':
-    data = data.groupby('run_id').apply(statistics_report, outfile, context, "protein", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette).reset_index()
+    data = data.groupby('run_id').apply(statistics_report, outfile, context, "protein", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette)
 
 elif context in ['global', 'experiment-wide']:
     data = statistics_report(data, outfile, context, "protein", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette)
 
@@ -257,7 +262,7 @@ def infer_peptides(infile, outfile, context, parametric, pfdr, pi0_lambda, pi0_m
 con.close()
 
 if context == 'run-specific':
-    data = data.groupby('run_id').apply(statistics_report, outfile, context, "peptide", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette).reset_index()
+    data = data.groupby('run_id').apply(statistics_report, outfile, context, "peptide", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette)
 
 elif context in ['global', 'experiment-wide']:
     data = statistics_report(data, outfile, context, "peptide", parametric, pfdr, pi0_lambda, pi0_method, pi0_smooth_df, pi0_smooth_log_pi0, lfdr_truncate, lfdr_monotone, lfdr_transformation, lfdr_adj, lfdr_eps, color_palette)

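Two things change here: save_report now receives plain NumPy arrays (.values) rather than pandas Series, avoiding index-alignment surprises in the plotting code, and the run-specific branches drop .reset_index(), presumably because statistics_report already returns frames that carry run_id as a column, so re-inserting the group key would collide. A toy illustration (hypothetical data) of why .values matters after filtering:

    import pandas as pd

    df = pd.DataFrame({"decoy": [1, 0, 0], "score": [0.2, 0.9, 0.5]})
    target_scores = df[df.decoy == 0]["score"]  # Series keeps index [1, 2]
    print(target_scores.values)                 # plain ndarray [0.9 0.5], no index baggage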
pyprophet/main.py

Lines changed: 2 additions & 0 deletions
@@ -106,6 +106,8 @@ def score(infile, outfile, classifier, xgb_autotune, apply_weights, xeval_fracti
 xgb_hyperparams = {'autotune': xgb_autotune, 'autotune_num_rounds': 10, 'num_boost_round': 100, 'early_stopping_rounds': 10, 'test_size': 0.33}
 
 xgb_params = {'eta': 0.3, 'gamma': 0, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': 1, 'alpha': 0, 'scale_pos_weight': 1, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}
+if test:
+    xgb_params['tree_method'] = 'exact'
 
 xgb_params_space = {'eta': hp.uniform('eta', 0.0, 0.3), 'gamma': hp.uniform('gamma', 0.0, 0.5), 'max_depth': hp.quniform('max_depth', 2, 8, 1), 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'lambda': hp.uniform('lambda', 0.0, 1.0), 'alpha': hp.uniform('alpha', 0.0, 1.0), 'scale_pos_weight': 1.0, 'verbosity': 0, 'objective': 'binary:logitraw', 'nthread': 1, 'eval_metric': 'auc'}

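Pinning tree_method under the test flag is a reproducibility measure: depending on version and data size, XGBoost may pick an approximate histogram-based algorithm, while 'exact' greedy split finding is deterministic, so regression tests see stable scores. A minimal sketch of the same idea (running_tests is a hypothetical stand-in for the test flag above):

    import xgboost as xgb

    running_tests = True  # hypothetical stand-in for the `test` CLI flag
    params = {'objective': 'binary:logitraw', 'eval_metric': 'auc', 'verbosity': 0}
    if running_tests:
        params['tree_method'] = 'exact'  # deterministic split finding for stable test output
    clf = xgb.XGBClassifier(**params)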
pyprophet/semi_supervised.py

Lines changed: 3 additions & 11 deletions
@@ -3,7 +3,7 @@
 
 from .data_handling import Experiment, update_chosen_main_score_in_table
 from .classifiers import AbstractLearner, XGBLearner
-from .stats import mean_and_std_dev, find_cutoff
+from .stats import find_cutoff
 
 try:
     profile
 
@@ -64,13 +64,9 @@ def learn_randomized(self, experiment, score_columns, working_thread_number):
 
 # after semi supervised iteration: classify full dataset
 clf_scores = self.score(experiment, params)
-mu, nu = mean_and_std_dev(clf_scores)
 experiment.set_and_rerank("classifier_score", clf_scores)
 
-td_scores = experiment.get_top_decoy_peaks()["classifier_score"]
-
-mu, nu = mean_and_std_dev(td_scores)
-experiment["classifier_score"] = (experiment["classifier_score"] - mu) / nu
+experiment.normalize_score_by_decoys('classifier_score')
 experiment.rank_by("classifier_score")
 
 top_test_peaks = experiment.get_top_test_peaks()
 
@@ -92,13 +88,9 @@ def learn_final(self, experiment):
 
 # after semi supervised iteration: classify full dataset
 clf_scores = self.score(experiment, params)
-mu, nu = mean_and_std_dev(clf_scores)
 experiment.set_and_rerank("classifier_score", clf_scores)
 
-td_scores = experiment.get_top_decoy_peaks()["classifier_score"]
-
-mu, nu = mean_and_std_dev(td_scores)
-experiment["classifier_score"] = (experiment["classifier_score"] - mu) / nu
+experiment.normalize_score_by_decoys('classifier_score')
 experiment.rank_by("classifier_score")
 
 return params

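Both hunks replace the same five-line normalization block with a single call to the new Experiment.normalize_score_by_decoys method from data_handling.py, and also drop a mean_and_std_dev(clf_scores) computation whose result was immediately overwritten. Beyond deduplication, the shared method adds the zero-variance guard shown above.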