Skip to content

Commit d0b8e85

Browse files
zcain117 and Borda authored
integrate with CircleCI (#2486)
* add circleCI * wip * CircleCI setup that worked on my private repo. Use a working pytorch-lightning commit * Fix the orb imports * Update circleci header comment * Try to pull the GITHUB_REF from the CI_PULL_REQUEST * Use null instead of space for 'sed' * Add TODO for codecov * Remove echo of GKE_CLUSTER since it will be redacted by CircleCI. * Try running codecov upload. * Try using codecov orb * Use pip install codecov * Use codecov orb again since it should be approved * dockers/tpu-tests/Dockerfile * action * suggestions * drop suggestion * suggestion Co-authored-by: Jirka <[email protected]>
1 parent 1e68968 commit d0b8e85

File tree

3 files changed

+108
-47
lines changed

3 files changed

+108
-47
lines changed

.circleci/config.yml

+104-44
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,118 @@
1-
# Python CircleCI 2.0 configuration file
2-
#
3-
# Check https://circleci.com/docs/2.0/language-python/ for more details
4-
#
5-
version: 2.0
1+
# Python CircleCI 2.1 configuration file.
2+
version: 2.1
3+
orbs:
4+
gcp-gke: circleci/[email protected]
5+
go: circleci/[email protected]
6+
codecov: codecov/[email protected]
67

78
references:
89

9-
install_deps: &install_deps
10+
make_docs: &make_docs
1011
run:
11-
name: Install Dependences
12+
name: Make Documentation
1213
command: |
13-
sudo apt-get update && sudo apt-get install -y cmake
14-
pip install -r requirements/base.txt -q
15-
pip install -r requirements/test.txt -q
14+
# First run the same pipeline as Read-The-Docs
15+
# apt-get update && apt-get install -y cmake
16+
# using: https://hub.docker.com/r/readthedocs/build
17+
# we need to use py3.7 or higher because of an issue with metaclass inheritance
18+
pyenv global 3.7.3
19+
python --version
20+
pip install -r requirements/docs.txt
21+
cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W"
1622
17-
tests: &tests
23+
checkout_ml_testing: &checkout_ml_testing
1824
run:
19-
name: Testing
25+
name: Checkout ml-testing-accelerators
2026
command: |
21-
python --version ; pip --version ; pip list
22-
python -m pytest pytorch_lightning -v --junitxml=test-reports/pytest_junit.xml --ignore=pytorch_lightning/loggers/comet.py --ignore=pytorch_lightning/loggers/mlflow.py --ignore=pytorch_lightning/loggers/neptune.py --ignore=pytorch_lightning/loggers/test_tube.py --ignore=pytorch_lightning/loggers/wandb.py --ignore=pytorch_lightning/metrics/sklearns.py
23-
no_output_timeout: 15m
27+
git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
28+
cd ml-testing-accelerators
29+
git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
30+
git checkout stable
31+
cd ..
2432
25-
make_docs: &make_docs
33+
build_push_docker: &build_push_docker
34+
run:
35+
name: Build and push Docker image
36+
command: |
37+
gcloud --quiet auth configure-docker
38+
cd dockers/tpu-tests
39+
# TODO: How to find the GITHUB_REF in CircleCI?
40+
# $CI_PULL_REQUEST seems to be of form: https://github.com/org/repo-name/pull/11.
41+
# Grab the last bit, e.g. pull/11, convert to pull/11/head, and use it
42+
# for the GITHUB_REF so Docker can pull the latest pending code in PR.
43+
git_ref=$(echo "$CI_PULL_REQUEST" | sed "s/.*pytorch-lightning\///")/head
44+
docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .
45+
#docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
46+
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
47+
48+
deploy_cluster: &deploy_cluster
2649
run:
27-
name: Make Documentation
50+
name: Deploy the job on the kubernetes cluster
2851
command: |
29-
# First run the same pipeline as Read-The-Docs
30-
# apt-get update && apt-get install -y cmake
31-
# using: https://hub.docker.com/r/readthedocs/build
32-
# we need to use py3.7 or higher because of an issue with metaclass inheritance
33-
pyenv global 3.7.3
34-
python --version
35-
pip install -r requirements/docs.txt
36-
cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W"
52+
go get github.com/google/go-jsonnet/cmd/jsonnet
53+
export PATH=$PATH:$HOME/go/bin
54+
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -)
55+
job_name=${job_name#job.batch/}
56+
job_name=${job_name% created}
57+
echo "Waiting on kubernetes job: $job_name"
58+
i=0 && \
59+
# MAX_CHECKS checks spaced 30s apart (with MAX_CHECKS=60 that is up to 1800s total).
60+
status_code=2 && \
61+
# Check on the job periodically. Set the status code depending on what
62+
# happened to the job in Kubernetes. If we try MAX_CHECKS times and
63+
# still the job hasn't finished, give up and return the starting
64+
# non-zero status code.
65+
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
66+
echo "Done waiting. Job status code: $status_code" && \
67+
# Allow time for logs to flush.
68+
sleep 30 && \
69+
echo "JOB_NAME: $job_name" && \
70+
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
71+
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
72+
# First portion is the test logs. Print these to the CircleCI job stdout.
73+
cat xx00 && \
74+
echo "Done with log retrieval attempt." && \
75+
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
76+
exit $status_code
77+
78+
stats: &stats
79+
run:
80+
name: Statistics
81+
command: |
82+
mv ./xx01 coverage.xml
83+
# TODO: add human readable report
84+
cat coverage.xml
85+
sudo pip install pycobertura
86+
pycobertura show coverage.xml
3787
3888
jobs:
3989

40-
Build-Docs:
90+
TPU-tests:
91+
docker:
92+
- image: circleci/python:3.7
93+
environment:
94+
- MAX_CHECKS: 60
95+
steps:
96+
- checkout
97+
- go/install
98+
- *checkout_ml_testing
99+
- gcp-gke/install
100+
- gcp-gke/update-kubeconfig-with-credentials:
101+
cluster: $GKE_CLUSTER
102+
perform-login: true
103+
- setup_remote_docker
104+
- *build_push_docker
105+
- *deploy_cluster
106+
- *stats
107+
- codecov/upload:
108+
file: coverage.xml
109+
flags: tpu,pytest
110+
upload_name: TPU-coverage
111+
112+
- store_artifacts:
113+
path: coverage.xml
114+
115+
build-Docs:
41116
docker:
42117
- image: readthedocs/build:latest
43118
steps:
@@ -48,24 +123,9 @@ jobs:
48123
path: docs/build/html/
49124
destination: html
50125

51-
CPU-Tests:
52-
# todo: to be replaced by TPU tests
53-
docker:
54-
- image: circleci/python:3.6
55-
environment:
56-
- TORCH_VERSION: "torch"
57-
steps: &steps
58-
- checkout
59-
- *install_deps
60-
- *tests
61-
- store_test_results:
62-
path: test-reports
63-
- store_artifacts:
64-
path: test-reports
65-
66126
workflows:
67127
version: 2
68-
build:
128+
tpu-tests:
69129
jobs:
70-
- Build-Docs
71-
- CPU-Tests
130+
- build-Docs
131+
- TPU-tests

.github/workflows/tpu-testing.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ on:
55
branches:
66
- master
77
# TODO: temporarily disable TPU testing until we find a way to pass credentials to forked PRs
8-
pull_request:
9-
branches:
10-
- master
8+
# pull_request:
9+
# branches:
10+
# - master
1111

1212
env:
1313
PROJECT_ID: ${{ secrets.GKE_PROJECT }}

dockers/tpu-tests/Dockerfile

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ ARG TEST_IMAGE=0
1313
# Install pytorch-lightning at the current PR, plus dependencies.
1414
RUN git clone https://github.com/PyTorchLightning/pytorch-lightning.git && \
1515
cd pytorch-lightning && \
16+
echo $GITHUB_REF && \
1617
git fetch origin $GITHUB_REF:CI && \
1718
git checkout CI && \
1819
pip install --requirement ./requirements/base.txt --no-cache-dir

0 commit comments

Comments
 (0)