Skip to content

Commit 84bd063

Browse files
authored
Take heapdump and coredumps on low memory (#2430)
* NONE: admin endpoint * NONE: debug * NONE: add gdb, fix build * NONE: fix Dockerfile * NONE: fix build * NONE: fix build * some fixes * NONE: couple of fixes * NONE: add uploading to s3 * NONE: fix s3 uploading * NONE: better logging * NONE: tests * NONE: add heapdump generator, too * NONE: self-review * NONE: fix tests * NONE: missing labels * NONE: comment * NONE: fix tests
1 parent b4eceab commit 84bd063

23 files changed

+661
-27
lines changed

.env.test

+4
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,7 @@ MICROS_PLATFORM_STATSD_HOST=localhost
6262
MICROS_PLATFORM_STATSD_PORT=8125
6363

6464
JIRA_TEST_SITES=https://site-1.some-test.atlassian.net,https://site-2.some-test.atlassian.net,https://site-3.some-test.atlassian.net
65+
66+
S3_DUMPS_BUCKET_NAME=my-bucket
67+
S3_DUMPS_BUCKET_PATH=my/path
68+
S3_DUMPS_BUCKET_REGION=my-region

Dockerfile

+4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ FROM node:14.21-alpine3.16 as build
33
# adding python for node-gyp
44
RUN apk add g++ make python3
55

6+
# For coredumps
7+
RUN apk add gdb
8+
RUN apk add bash
9+
610
# adding to solve vuln
711
RUN apk add --update --upgrade busybox libretls openssl zlib curl
812

Dockerfile.prod

+8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ FROM node:14.21-alpine3.16 as build
33
# adding python for node-gyp
44
RUN apk add g++ make python3
55

6+
# For coredumps
7+
RUN apk add gdb
8+
RUN apk add bash
9+
610
# adding to solve vuln
711
RUN apk add --update --upgrade busybox
812
RUN apk add --update --upgrade libretls
@@ -31,6 +35,10 @@ RUN apk add --update --upgrade zlib
3135
# For debugging curl command
3236
RUN apk add curl
3337

38+
# For coredumps
39+
RUN apk add gdb
40+
RUN apk add bash
41+
3442
USER node
3543
COPY --chown=node:node --from=build /app /app
3644
# Add this the service Dockerfile, at the final stage if multi-stage

bin/start-server-micros.sh

+8-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ case "$MICROS_GROUP" in
1515
;;
1616
esac
1717

18-
export DATABASE_URL=postgres://$PG_DATABASE_ROLE:$PG_DATABASE_PASSWORD@$PG_DATABASE_BOUNCER:$PG_DATABASE_PORT/$PG_DATABASE_SCHEMA
18+
case "$MICROS_ENVTYPE" in
19+
"dev")
20+
export NODE_OPTIONS="--max-old-space-size=250" # since ddev nodes have smaller memory available in general
21+
;;
22+
esac
1923

24+
export DATABASE_URL=postgres://$PG_DATABASE_ROLE:$PG_DATABASE_PASSWORD@$PG_DATABASE_BOUNCER:$PG_DATABASE_PORT/$PG_DATABASE_SCHEMA
25+
echo "We are at:"
26+
pwd
2027
npm run "${COMMAND}"

docker-compose.yml

+1
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,4 @@ services:
8989
dockerfile: etc/app-install/Dockerfile
9090
env_file:
9191
- .env
92+

github-for-jira.sd.yml

+11
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,17 @@ resources:
158158
# Name of the RDS resource from above
159159
resource: rds
160160

161+
- type: s3
162+
name: "dumps" # for heapdumps, temp resource, will be deleted after the investigation is over
163+
attributes:
164+
dataType:
165+
- UGC/Label # name of GitHub org / Jira site
166+
- PII/IndirectConfidential # name of GitHub org
167+
- UGC/Configuration # data about the installation of the GitHub app into Jira sites
168+
- Security/Secret # shared Connect secret
169+
- UGC/PrimaryIdentifier # references to GitHub entities (commits, pull requests, etc.) and Jira issues
170+
- UGC/Primary # GitHub entities (non-persistent) like commits, pull requests, etc.
171+
161172
- name: deployment-history-cache
162173
type: dynamo-db
163174
attributes: &table-attributes

package.json

+3-2
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
"start:main": "tsnd -r tsconfig-paths/register --watch=.env*,db/config.json --inspect=0.0.0.0:9229 --respawn --transpile-only -- src/main.ts",
1919
"start:worker": "tsnd -r tsconfig-paths/register --watch=.env*,db/config.json --inspect=0.0.0.0:9230 --respawn --transpile-only -- src/worker.ts",
2020
"start:production": "run-p start:main:production start:worker:production",
21-
"start:main:production": "node -r tsconfig-paths/register src/main",
22-
"start:worker:production": "node -r tsconfig-paths/register src/worker",
21+
"start:main:production": "TS_NODE_BASEURL=\"build\" node --expose-gc -r tsconfig-paths/register build/src/main",
22+
"start:worker:production": "TS_NODE_BASEURL=\"build\" node --expose-gc -r tsconfig-paths/register build/src/worker",
2323
"setup": "dotenv -- ts-node -r tsconfig-paths/register prestart.ts",
2424
"build": "tsc -p ./tsconfig.json && yarn spa:build",
2525
"build:release": "tsc -p ./tsconfig.release.json && yarn spa:build",
@@ -77,6 +77,7 @@
7777
"date-fns": "^1.29.0",
7878
"dotenv": "^16.0.1",
7979
"dotenv-expand": "^8.0.3",
80+
"dumpme": "^1.0.3",
8081
"event-loop-lag": "^1.4.0",
8182
"express": "^4.17.3",
8283
"express-async-handler": "^1.2.0",

src/config/env.ts

+8-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const nodeEnv: EnvironmentEnum = EnvironmentEnum[getNodeEnv()];
1515
`.env.${nodeEnv}`,
1616
".env"
1717
].map((env) => expand(config({
18-
path: path.resolve(__dirname, "../..", env)
18+
path: path.resolve(process.cwd(), env)
1919
})));
2020

2121
type Transforms<T, K extends keyof T = keyof T> = {
@@ -26,6 +26,9 @@ const transforms: Transforms<EnvVars> = {
2626
MICROS_ENV: (value?: string) => EnvironmentEnum[value || EnvironmentEnum.development],
2727
MICROS_GROUP: (value?: string) => value || "",
2828
NODE_ENV: () => nodeEnv,
29+
S3_DUMPS_BUCKET_NAME: (value?: string) => value ?? "",
30+
S3_DUMPS_BUCKET_PATH: (value?: string) => value ?? "",
31+
S3_DUMPS_BUCKET_REGION: (value?: string) => value ?? "",
2932
PROXY: () => {
3033
const proxyHost = process.env.EXTERNAL_ONLY_PROXY_HOST;
3134
const proxyPort = process.env.EXTERNAL_ONLY_PROXY_PORT;
@@ -156,4 +159,8 @@ export interface EnvVars {
156159
MICROS_PLATFORM_STATSD_PORT: string;
157160

158161
JIRA_TEST_SITES: string;
162+
163+
S3_DUMPS_BUCKET_NAME: string;
164+
S3_DUMPS_BUCKET_PATH: string;
165+
S3_DUMPS_BUCKET_REGION: string;
159166
}

src/config/feature-flags.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ export enum BooleanFlags {
3131
DELETE_MESSAGE_ON_BACKFILL_WHEN_OTHERS_WORKING_ON_IT = "delete-message-on-backfill-when-others-working-on-it",
3232
USE_NEW_5KU_SPA_EXPERIENCE = "enable-5ku-experience--cloud-connect",
3333
USE_INSTALLATION_CLIENT_CHECK_PERMISSION = "use-installation-client-to-check-permission",
34-
USE_CUSTOM_ROOT_CA_BUNDLE = "use-custom-root-ca-bundle"
34+
USE_CUSTOM_ROOT_CA_BUNDLE = "use-custom-root-ca-bundle",
35+
GENERATE_CORE_HEAP_DUMPS_ON_LOW_MEM = "generate-core-heap-dumps-on-low-mem"
3536
}
3637

3738
export enum StringFlags {

src/main.ts

+7-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ const CONF_WORKER_KEEP_ALIVE_PERIOD_MSEC = 7000;
2626
const CONF_MASTER_WORKERS_POLL_INTERVAL_MSEC = Math.floor(
2727
CONF_WORKER_KEEP_ALIVE_PERIOD_MSEC * (2 + Math.random()) // different from KEEP_ALIVE to keep logs separated
2828
);
29+
const CONF_WORKER_DUMP_INTERVAL_MSEC = 10000;
30+
const CONF_WORKER_DUMP_LOW_HEAP_PCT = 25;
2931

3032
const handleErrorsGracefully = (logger: Logger, intervalsToClear: NodeJS.Timeout[]) => {
3133
process.on("uncaughtExceptionMonitor", (err, origin) => {
@@ -52,7 +54,11 @@ const handleErrorsGracefully = (logger: Logger, intervalsToClear: NodeJS.Timeout
5254

5355
const troubleshootUnresponsiveWorkers_worker = () => {
5456
handleErrorsGracefully(unresponsiveWorkersLogger,
55-
[startMonitorOnWorker(unresponsiveWorkersLogger, CONF_WORKER_KEEP_ALIVE_PERIOD_MSEC)]
57+
startMonitorOnWorker(unresponsiveWorkersLogger, {
58+
iAmAliveInervalMsec: CONF_WORKER_KEEP_ALIVE_PERIOD_MSEC,
59+
dumpIntervalMsec: CONF_WORKER_DUMP_INTERVAL_MSEC,
60+
lowHeapAvailPct: CONF_WORKER_DUMP_LOW_HEAP_PCT
61+
})
5662
);
5763
};
5864

src/routes/api/api-router.test.ts

+13
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,19 @@ describe("API Router", () => {
362362

363363
});
364364

365+
describe("fill-mem-and-generate-coredump", () => {
366+
it("should return 200", () => {
367+
return supertest(app)
368+
.post("/api/fill-mem-and-generate-coredump?arraySize=2&nIter=7")
369+
.set("host", "127.0.0.1")
370+
.set("X-Slauth-Mechanism", "slauthtoken")
371+
.expect(200)
372+
.then((response) => {
373+
expect(response.body).toEqual({ allocated: 14, dumpGenerated: false });
374+
});
375+
});
376+
});
377+
365378
describe("Ping", () => {
366379

367380
it("Should fail on missing url", () => {

src/routes/api/api-router.ts

+56
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ import { ApiResetSubscriptionFailedTasks } from "./api-reset-subscription-failed
2424
import { RecoverCommitsFromDatePost } from "./commits-from-date/recover-commits-from-dates";
2525
import { ResetFailedAndPendingDeploymentCursorPost } from "./commits-from-date/reset-failed-and-pending-deployment-cursors";
2626
import { ApiRecryptPost } from "./api-recrypt-post";
27+
import { GenerateOnceCoredumpGenerator } from "services/generate-once-coredump-generator";
28+
import { GenerateOncePerNodeHeadumpGenerator } from "services/generate-once-per-node-headump-generator";
29+
2730
export const ApiRouter = Router();
2831

2932
// TODO: remove this duplication because of the horrible way to do logs through requests
@@ -98,6 +101,59 @@ ApiRouter.post("/recrypt", ApiRecryptPost);
98101

99102
ApiRouter.post("/ping", ApiPingPost);
100103

104+
/**
105+
* Workable parameters for ddev (250Mb heap):
106+
*
107+
* to occupy ~25% of mem and generate coredump:
108+
* - ?arraySize=20000&nIter=400&pctThreshold=75
109+
*
110+
* to generate coredump straight away, without occupying any extra mem:
111+
 * - ?arraySize=1&nIter=1&pctThreshold=100
112+
*
113+
* to generate heapdump:
114+
* - ?arraySize=20000&nIter=400&pctThreshold=75&heap=true
115+
*
116+
* If you are generating heapdumps, you'll need to ssh to the instance and delete the lock file, because
117+
* in production only one heapdump is allowed per node due to extremely high CPU/mem usage!
118+
*
119+
*/
120+
const FillMemAndGenerateCoreDump = (req: Request, res: Response) => {
121+
const nIter = parseInt(req.query?.nIter?.toString() || "0");
122+
const arraySize = parseInt(req.query?.arraySize?.toString() || "10");
123+
const pctThreshold = parseInt(req.query?.pctThreshold?.toString() || "50");
124+
const heap = !!req.query?.heap;
125+
req.log.info({
126+
nIter, arraySize, pctThreshold, heap
127+
}, "FillMemAndGenerateCoreDump triggered");
128+
129+
const generator: GenerateOncePerNodeHeadumpGenerator | GenerateOnceCoredumpGenerator =
130+
heap ? new GenerateOncePerNodeHeadumpGenerator({
131+
logger: req.log,
132+
lowHeapAvailPct: pctThreshold
133+
}) : new GenerateOnceCoredumpGenerator({
134+
logger: req.log,
135+
lowHeapAvailPct: pctThreshold
136+
});
137+
138+
let dumpGenerated = false;
139+
const allocate = (iter: number) => {
140+
if (generator.maybeGenerateDump()) {
141+
dumpGenerated = true;
142+
return [];
143+
}
144+
const arr = new Array(arraySize).fill(`${Math.random()} This is a test string. ${Math.random()}`);
145+
146+
if (iter + 1 < nIter) {
147+
const anotherOne = allocate(iter + 1);
148+
return arr.concat(anotherOne);
149+
}
150+
return arr;
151+
};
152+
res.json({ allocated: allocate(0).length, dumpGenerated: dumpGenerated });
153+
};
154+
155+
ApiRouter.post("/fill-mem-and-generate-coredump", FillMemAndGenerateCoreDump);
156+
101157
// TODO: remove once move to DELETE /:installationId/:jiraHost
102158
ApiRouter.delete(
103159
"/deleteInstallation/:installationId/:jiraHost/github-app-id/:gitHubAppId",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import { GenerateOnceCoredumpGenerator } from "services/generate-once-coredump-generator";
2+
import { getLogger } from "config/logger";
3+
import fs from "fs";
4+
import dumpme from "dumpme";
5+
import { hasEnoughFreeHeap } from "utils/heap-size-utils";
6+
7+
jest.mock("utils/heap-size-utils");
8+
jest.mock("fs");
9+
jest.mock("dumpme");
10+
11+
describe("coredump-generator", () => {
12+
let generator: GenerateOnceCoredumpGenerator;
13+
14+
beforeEach(() => {
15+
generator = new GenerateOnceCoredumpGenerator({
16+
logger: getLogger("test"),
17+
lowHeapAvailPct: 20
18+
});
19+
});
20+
21+
it("should not create coredump when enough memory available", () => {
22+
(hasEnoughFreeHeap as jest.Mock).mockReturnValue(true);
23+
expect(generator.maybeGenerateDump()).toBeFalsy();
24+
expect(dumpme).not.toBeCalled();
25+
expect(fs.renameSync).not.toBeCalled();
26+
});
27+
28+
it("should create coredump when heap is too small", () => {
29+
(hasEnoughFreeHeap as jest.Mock).mockReturnValue(false);
30+
expect(generator.maybeGenerateDump()).toBeTruthy();
31+
expect(dumpme).toBeCalled();
32+
expect(fs.renameSync).toBeCalledWith(`/tmp/dump_core.${process.pid.toString()}`, `/tmp/dump_core.${process.pid.toString()}.ready`);
33+
});
34+
35+
it("should create only a single coredump file", () => {
36+
(hasEnoughFreeHeap as jest.Mock).mockReturnValue(false);
37+
expect(generator.maybeGenerateDump()).toBeTruthy();
38+
expect(generator.maybeGenerateDump()).toBeFalsy();
39+
expect(dumpme).toBeCalledTimes(1);
40+
});
41+
});
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import Logger from "bunyan";
2+
import dumpme from "dumpme";
3+
import fs from "fs";
4+
import { hasEnoughFreeHeap } from "utils/heap-size-utils";
5+
6+
/**
7+
* Please note: it will generate the coredump only once for this instance! If you need to take it multiple times,
8+
* create a new object.
9+
*/
10+
export class GenerateOnceCoredumpGenerator {
11+
private config: {
12+
logger: Logger,
13+
lowHeapAvailPct: number,
14+
};
15+
16+
private coreDumpGenerated = false;
17+
18+
constructor(config: {
19+
logger: Logger,
20+
lowHeapAvailPct: number,
21+
}) {
22+
this.config = config;
23+
}
24+
25+
/**
26+
* In case of success, a file will be generated with the path /tmp/dump_core.PID
27+
*/
28+
public maybeGenerateDump(): boolean {
29+
if (this.coreDumpGenerated) {
30+
return false;
31+
}
32+
33+
if (!hasEnoughFreeHeap(this.config.lowHeapAvailPct, this.config.logger)) {
34+
this.config.logger.info(`Triggering coredump...`);
35+
36+
const tsBeforeDump = Date.now();
37+
dumpme(undefined, `/tmp/dump_core`); // pid will be added by dumpme() as a suffix
38+
const tsAfterDump = Date.now();
39+
40+
this.config.logger.info(`Core dump was created, took ${tsAfterDump - tsBeforeDump}`);
41+
42+
fs.renameSync(`/tmp/dump_core.${process.pid.toString()}`, `/tmp/dump_core.${process.pid.toString()}.ready`);
43+
this.coreDumpGenerated = true;
44+
return true;
45+
}
46+
47+
return false;
48+
}
49+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import Logger from "bunyan";
2+
import fs from "fs";
3+
import { hasEnoughFreeHeap } from "utils/heap-size-utils";
4+
import v8 from "v8";
5+
6+
const LOCk_FILE_PATH = "/tmp/hd_generated";
7+
8+
/**
9+
 * Taking a heapdump is very expensive, therefore it can be triggered only once per EC2 node for its whole lifetime!
10+
*/
11+
export class GenerateOncePerNodeHeadumpGenerator {
12+
private config: {
13+
logger: Logger,
14+
lowHeapAvailPct: number,
15+
};
16+
17+
constructor(config: {
18+
logger: Logger,
19+
lowHeapAvailPct: number,
20+
}) {
21+
this.config = config;
22+
}
23+
24+
public maybeGenerateDump() {
25+
if (this.generatedEarlier()) {
26+
return false;
27+
}
28+
29+
if (!hasEnoughFreeHeap(this.config.lowHeapAvailPct, this.config.logger)) {
30+
this.markAsGeneratedEarlier();
31+
32+
this.config.logger.info(`Triggering heapdump...`);
33+
34+
const heapSnapshotStream = v8.getHeapSnapshot();
35+
const writeStream = fs.createWriteStream("/tmp/dump_heap.generating");
36+
37+
heapSnapshotStream.pipe(writeStream);
38+
39+
writeStream.on("finish", () => {
40+
this.config.logger.info("Heapdump was generated and saved!");
41+
fs.renameSync("/tmp/dump_heap.generating", "/tmp/dump_heap.ready");
42+
});
43+
return true;
44+
}
45+
46+
return false;
47+
}
48+
49+
private generatedEarlier() {
50+
return fs.existsSync(LOCk_FILE_PATH);
51+
}
52+
53+
private markAsGeneratedEarlier() {
54+
fs.writeFileSync(LOCk_FILE_PATH, new Date().toISOString());
55+
}
56+
}

0 commit comments

Comments
 (0)