From c7d5af81b767293618de89b221e93872e8824ecc Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 9 May 2026 21:48:42 +0000 Subject: [PATCH 1/2] fix: address production-readiness review across fetch, storage, parsing, CLI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire up the previously-unregistered EntityHistory and AddressHistoryJunction repositories so EntityTemporalRepo no longer throws at construction. Switch SQLite to WAL + synchronous=NORMAL so a crash mid-write can no longer corrupt the database. Add closeDb()/closePgPool() to the CLI exit path. Strip the angle brackets from the SEC User-Agent (some EDGAR endpoints 403 on RFC-5322 mailbox forms) and let deployers override it via SEC_USER_AGENT. Make the file-output cache durable: write to a unique tmp path then atomic rename, round-trip Blob → Buffer → Blob correctly, and only swallow ENOENT on read. Cache freshness comment now matches the actual comparison. Add an HTTP retry-with-Retry-After loop and a per-attempt timeout to SecFetchJob so transient 429/5xx and hung TCP connections no longer pin a queue slot. Honor the rate-limiter via the existing CompositeLimiter. Key Form._arrayPaths by the subclass constructor instead of `this.name`, removing a latent collision when two Form subclasses share a description. Centralize the ad-hoc `isBadPersonField` allowlists into one shared `bad-data.ts` registry, and add NaN-safe parseInt helpers in Form_C/Form_D storage. Use the actual US state set to derive country_code in Form 1-A instead of `length === 2`, which silently coerced GB/FR to "US". Implement `db reset --confirm` (delete-all + setupAllDatabases) and clean up the CLI: drop the unused `--concurrency` flag, validate CIK args, fix the inclusive-range math in FetchQuarterlyIndexRangeTask, restore the missing `date` field in FetchCompanyFactsTask's empty-CIK branch, drop the dead null-check in StoreCompanyFactsTask, log per-item failures in the fetchAndStore* glue, and move Sqlite.init() into the preAction hook so help/version don't load the native binding. InvestmentOfferingHistorySchema.accession_number is now 20 chars to match real SEC accession numbers (Postgres would have rejected the old 10). package.json: rename the misnamed `publish` script to `release`, point `bin` at `./dist/sec.js` so it ships with the published tarball, and move concurrently to devDependencies. Delete the leftover `test_any.ts` scratch file. https://claude.ai/code/session_01UEfSvXJcxWC1csqry7FWmJ --- bun.lock | 70 ++++++--- package.json | 8 +- src/cli/GlobalOptions.test.ts | 16 +- src/cli/GlobalOptions.ts | 5 +- src/cli/cli.integration.test.ts | 1 - src/cli/groups/db.ts | 11 +- src/cli/groups/fetch.ts | 14 +- src/commands/index.ts | 9 +- src/config/Constants.ts | 10 +- src/config/DefaultDI.ts | 25 +++ src/config/TestingDI.ts | 22 +++ src/config/resetAllDatabases.ts | 107 +++++++++++++ src/config/setupAllDatabases.ts | 4 + src/fetch/SecCachedFetchTask.ts | 15 +- src/fetch/SecFetchFileOutputCache.ts | 49 +++++- src/fetch/SecFetchJob.ts | 125 ++++++++++++++- src/sec.ts | 15 +- src/sec/forms/Form.ts | 12 +- .../exempt-offerings/Form_1_A.storage.ts | 15 +- .../forms/exempt-offerings/Form_C.storage.ts | 36 ++--- .../forms/exempt-offerings/Form_D.storage.ts | 58 +++---- .../InvestmentOfferingHistorySchema.ts | 2 +- src/task/facts/FetchCompanyFactsTask.ts | 4 +- src/task/facts/StoreCompanyFactsTask.ts | 20 ++- src/task/facts/fetchAndStoreCompanyFacts.ts | 3 +- .../index/FetchQuarterlyIndexRangeTask.ts | 14 +- .../submissions/fetchAndStoreSubmission.ts | 2 + src/types/edgar/bad-data.ts | 148 +++++++++++++----- src/util/db.ts | 15 +- test_any.ts | 5 - 30 files changed, 633 insertions(+), 207 deletions(-) create mode 100644 src/config/resetAllDatabases.ts delete mode 100644 test_any.ts diff --git a/bun.lock b/bun.lock index 4efd50c..5552288 100644 --- a/bun.lock +++ b/bun.lock @@ -14,7 +14,6 @@ "cheerio-json-mapper": "^1.0.4", "commander": "^14.0.3", "compromise": "^14.15.0", - "concurrently": "^9.2.1", "csv-parse": "^6.2.1", "fast-xml-parser": "^5.7.3", "html-entities": "^2.6.0", @@ -30,6 +29,7 @@ "@types/pg": "^8.15.2", "@types/xml2js": "^0.4.14", "bunset": "1.0.12", + "concurrently": "^9.2.1", }, "peerDependencies": { "typescript": "^6.0.3", @@ -313,7 +313,7 @@ "ansi-regex": ["ansi-regex@6.2.2", "", {}, "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg=="], - "ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + "ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], "async-retry": ["async-retry@1.3.3", "", { "dependencies": { "retry": "0.13.1" } }, "sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw=="], @@ -341,7 +341,7 @@ "call-bound": ["call-bound@1.0.4", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "get-intrinsic": "^1.3.0" } }, "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg=="], - "chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + "chalk": ["chalk@4.1.2", "", { "dependencies": { "ansi-styles": "^4.1.0", "supports-color": "^7.1.0" } }, "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA=="], "cheerio": ["cheerio@1.2.0", "", { "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "encoding-sniffer": "^0.2.1", "htmlparser2": "^10.1.0", "parse5": "^7.3.0", "parse5-htmlparser2-tree-adapter": "^7.1.0", "parse5-parser-stream": "^7.1.2", "undici": "^7.19.0", "whatwg-mimetype": "^4.0.0" } }, "sha512-WDrybc/gKFpTYQutKIK6UvfcuxijIZfMfXaYm8NMsPQxSYvf+13fXUJ4rztGGbJcBQ/GF55gvrZ0Bc0bj/mqvg=="], @@ -563,7 +563,7 @@ "ipull": ["ipull@3.9.5", "", { "dependencies": { "@tinyhttp/content-disposition": "^2.2.0", "async-retry": "^1.3.3", "chalk": "^5.3.0", "ci-info": "^4.0.0", "cli-spinners": "^2.9.2", "commander": "^10.0.0", "eventemitter3": "^5.0.1", "filenamify": "^6.0.0", "fs-extra": "^11.1.1", "is-unicode-supported": "^2.0.0", "lifecycle-utils": "^2.0.1", "lodash.debounce": "^4.0.8", "lowdb": "^7.0.1", "pretty-bytes": "^6.1.0", "pretty-ms": "^8.0.0", "sleep-promise": "^9.1.0", "slice-ansi": "^7.1.0", "stdout-update": "^4.0.1", "strip-ansi": "^7.1.0" }, "optionalDependencies": { "@reflink/reflink": "^0.1.16" }, "bin": { "ipull": "dist/cli/cli.js" } }, "sha512-5w/yZB5lXmTfsvNawmvkCjYo4SJNuKQz/av8TC1UiOyfOHyaM+DReqbpU2XpWYfmY+NIUbRRH8PUAWsxaS+IfA=="], - "is-fullwidth-code-point": ["is-fullwidth-code-point@5.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.1" } }, "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ=="], + "is-fullwidth-code-point": ["is-fullwidth-code-point@3.0.0", "", {}, "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg=="], "is-in-ci": ["is-in-ci@2.0.0", "", { "bin": { "is-in-ci": "cli.js" } }, "sha512-cFeerHriAnhrQSbpAxL37W1wcJKUUX07HyLWZCW1URJT/ra3GyUTzBgUnh24TMVfNTV2Hij2HLxkPHFZfOZy5w=="], @@ -817,7 +817,7 @@ "steno": ["steno@4.0.2", "", {}, "sha512-yhPIQXjrlt1xv7dyPQg2P17URmXbuM5pdGkpiMB3RenprfiBlvK415Lctfe0eshk90oA7/tNq7WEiMK8RSP39A=="], - "string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], + "string-width": ["string-width@4.2.3", "", { "dependencies": { "emoji-regex": "^8.0.0", "is-fullwidth-code-point": "^3.0.0", "strip-ansi": "^6.0.1" } }, "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g=="], "strip-ansi": ["strip-ansi@7.2.0", "", { "dependencies": { "ansi-regex": "^6.2.2" } }, "sha512-yDPMNjp4WyfYBkHnjIRLfca1i6KMyGCtsVgoKe/z1+6vukgaENdgGBZt+ZmKPc4gavvEZ5OgHfHdrazhgNyG7w=="], @@ -919,9 +919,19 @@ "zod-to-json-schema": ["zod-to-json-schema@3.25.1", "", { "peerDependencies": { "zod": "^3.25 || ^4" } }, "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA=="], + "@alcalzone/ansi-tokenize/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + + "@alcalzone/ansi-tokenize/is-fullwidth-code-point": ["is-fullwidth-code-point@5.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.1" } }, "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ=="], + + "@inkjs/ui/chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + + "@workglow/cli/chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + "@workglow/tasks/undici": ["undici@8.2.0", "", {}, "sha512-Z+4Hx9GE26Lh9Upwfnc8C7SsrpBPGaM/Gm6kMFtiG7c+5IvQKlXi/t+9x9DrrCh29cww5TSP9YdVaBcnLDs5fQ=="], - "cliui/string-width": ["string-width@4.2.3", "", { "dependencies": { "emoji-regex": "^8.0.0", "is-fullwidth-code-point": "^3.0.0", "strip-ansi": "^6.0.1" } }, "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g=="], + "chalk/supports-color": ["supports-color@7.2.0", "", { "dependencies": { "has-flag": "^4.0.0" } }, "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw=="], + + "cli-truncate/string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], "cliui/strip-ansi": ["strip-ansi@6.0.1", "", { "dependencies": { "ansi-regex": "^5.0.1" } }, "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A=="], @@ -929,12 +939,18 @@ "cmake-js/which": ["which@6.0.1", "", { "dependencies": { "isexe": "^4.0.0" }, "bin": { "node-which": "bin/which.js" } }, "sha512-oGLe46MIrCRqX7ytPUf66EAYvdeMIZYn3WaocqqKZAxrBpkqHfL/qvTyJ/bTk5+AqHCjXmrv3CEWgy368zhRUg=="], - "concurrently/chalk": ["chalk@4.1.2", "", { "dependencies": { "ansi-styles": "^4.1.0", "supports-color": "^7.1.0" } }, "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA=="], - "encoding-sniffer/iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], "htmlparser2/entities": ["entities@7.0.1", "", {}, "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA=="], + "ink/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + + "ink/chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + + "ink/string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], + + "ipull/chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + "ipull/cli-spinners": ["cli-spinners@2.9.2", "", {}, "sha512-ywqV+5MmyL4E7ybXgKys4DugZbX0FC6LnwrhjuykIjnK9k8OQacQ7axGKnjDXWNhns0xot3bZI5h55H8yo9cJg=="], "ipull/commander": ["commander@10.0.1", "", {}, "sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug=="], @@ -947,14 +963,20 @@ "matcher/escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="], + "node-llama-cpp/chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + "node-llama-cpp/slice-ansi": ["slice-ansi@8.0.0", "", { "dependencies": { "ansi-styles": "^6.2.3", "is-fullwidth-code-point": "^5.1.0" } }, "sha512-stxByr12oeeOyY2BlviTNQlYV5xOj47GirPr4yA1hE9JCtxfQN0+tVbkxwCtYDQWhEKWFHsEK48ORg5jrouCAg=="], "node-llama-cpp/which": ["which@6.0.1", "", { "dependencies": { "isexe": "^4.0.0" }, "bin": { "node-which": "bin/which.js" } }, "sha512-oGLe46MIrCRqX7ytPUf66EAYvdeMIZYn3WaocqqKZAxrBpkqHfL/qvTyJ/bTk5+AqHCjXmrv3CEWgy368zhRUg=="], "onnxruntime-web/onnxruntime-common": ["onnxruntime-common@1.24.0-dev.20251116-b39e144322", "", {}, "sha512-BOoomdHYmNRL5r4iQ4bMvsl2t0/hzVQ3OM3PHD0gxeXu1PmggqBv3puZicEUVOA3AtHHYmqZtjMj9FOfGrATTw=="], + "ora/chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + "ora/cli-cursor": ["cli-cursor@5.0.0", "", { "dependencies": { "restore-cursor": "^5.0.0" } }, "sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw=="], + "ora/string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], + "parse5/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], "proper-lockfile/retry": ["retry@0.12.0", "", {}, "sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow=="], @@ -963,27 +985,39 @@ "serialize-error/type-fest": ["type-fest@0.13.1", "", {}, "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg=="], + "slice-ansi/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + + "slice-ansi/is-fullwidth-code-point": ["is-fullwidth-code-point@5.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.1" } }, "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ=="], + "stdout-update/ansi-escapes": ["ansi-escapes@6.2.1", "", {}, "sha512-4nJ3yixlEthEJ9Rk4vPcdBRkZvQZlYyu8j4/Mqz5sgIkddmEnH2Yj2ZrnP9S3tQOvSNRUIgVNF/1yPpRAGNRig=="], + "stdout-update/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + "stdout-update/string-width": ["string-width@7.2.0", "", { "dependencies": { "emoji-regex": "^10.3.0", "get-east-asian-width": "^1.0.0", "strip-ansi": "^7.1.0" } }, "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ=="], + "string-width/strip-ansi": ["strip-ansi@6.0.1", "", { "dependencies": { "ansi-regex": "^5.0.1" } }, "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A=="], + "whatwg-encoding/iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], - "yargs/string-width": ["string-width@4.2.3", "", { "dependencies": { "emoji-regex": "^8.0.0", "is-fullwidth-code-point": "^3.0.0", "strip-ansi": "^6.0.1" } }, "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g=="], + "widest-line/string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], - "cliui/string-width/is-fullwidth-code-point": ["is-fullwidth-code-point@3.0.0", "", {}, "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg=="], + "wrap-ansi/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], - "cliui/strip-ansi/ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], + "wrap-ansi/string-width": ["string-width@8.2.0", "", { "dependencies": { "get-east-asian-width": "^1.5.0", "strip-ansi": "^7.1.2" } }, "sha512-6hJPQ8N0V0P3SNmP6h2J99RLuzrWz2gvT7VnK5tKvrNqJoyS9W4/Fb8mo31UiPvy00z7DQXkP2hnKBVav76thw=="], - "cliui/wrap-ansi/ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], + "cliui/strip-ansi/ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], "cmake-js/which/isexe": ["isexe@4.0.0", "", {}, "sha512-FFUtZMpoZ8RqHS3XeXEmHWLA4thH+ZxCv2lOiPIn1Xc7CxrqhWzNSDzD+/chS/zbYezmiwWLdQC09JdQKmthOw=="], - "concurrently/chalk/ansi-styles": ["ansi-styles@4.3.0", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="], + "ipull/pretty-ms/parse-ms": ["parse-ms@3.0.0", "", {}, "sha512-Tpb8Z7r7XbbtBTrM9UhpkzzaMrqA2VXMT3YChzYltwV3P3pM6t8wl7TvpMnSTosz1aQAdVib7kdoys7vYOPerw=="], - "concurrently/chalk/supports-color": ["supports-color@7.2.0", "", { "dependencies": { "has-flag": "^4.0.0" } }, "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw=="], + "ipull/slice-ansi/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], - "ipull/pretty-ms/parse-ms": ["parse-ms@3.0.0", "", {}, "sha512-Tpb8Z7r7XbbtBTrM9UhpkzzaMrqA2VXMT3YChzYltwV3P3pM6t8wl7TvpMnSTosz1aQAdVib7kdoys7vYOPerw=="], + "ipull/slice-ansi/is-fullwidth-code-point": ["is-fullwidth-code-point@5.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.1" } }, "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ=="], + + "node-llama-cpp/slice-ansi/ansi-styles": ["ansi-styles@6.2.3", "", {}, "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg=="], + + "node-llama-cpp/slice-ansi/is-fullwidth-code-point": ["is-fullwidth-code-point@5.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.1" } }, "sha512-5XHYaSyiqADb4RnZ1Bdad6cPp8Toise4TzEjcOYDHZkTCbKgiUl7WTUCpNWHuxmDt91wnsZBc9xinNzopv3JMQ=="], "node-llama-cpp/which/isexe": ["isexe@4.0.0", "", {}, "sha512-FFUtZMpoZ8RqHS3XeXEmHWLA4thH+ZxCv2lOiPIn1Xc7CxrqhWzNSDzD+/chS/zbYezmiwWLdQC09JdQKmthOw=="], @@ -991,14 +1025,10 @@ "stdout-update/string-width/emoji-regex": ["emoji-regex@10.6.0", "", {}, "sha512-toUI84YS5YmxW219erniWD0CIVOo46xGKColeNQRgOzDorgBi1v4D71/OFzgD9GO2UGKIv1C3Sp8DAn0+j5w7A=="], - "yargs/string-width/is-fullwidth-code-point": ["is-fullwidth-code-point@3.0.0", "", {}, "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg=="], - - "yargs/string-width/strip-ansi": ["strip-ansi@6.0.1", "", { "dependencies": { "ansi-regex": "^5.0.1" } }, "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A=="], + "string-width/strip-ansi/ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], "ora/cli-cursor/restore-cursor/onetime": ["onetime@7.0.0", "", { "dependencies": { "mimic-function": "^5.0.0" } }, "sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ=="], "ora/cli-cursor/restore-cursor/signal-exit": ["signal-exit@4.1.0", "", {}, "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw=="], - - "yargs/string-width/strip-ansi/ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], } } diff --git a/package.json b/package.json index f5db40f..0b384ef 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "description": "Workglow SEC is an example of using the Workglow AI library to build a tool for retrieving SEC data.", "scripts": { "bunset": "bunset --patch --push", - "publish": "bun run build && bun run test && bun run bunset", + "release": "bun run build && bun run test && bun run bunset", "dev": "concurrently -c 'auto' -n 'sec:' 'bun:dev-*'", "dev-js": "bun build --watch --target=bun --sourcemap=external --packages=external --outdir ./dist ./src/sec.ts", "dev-types": "tsc --watch --preserveWatchOutput", @@ -15,7 +15,7 @@ "build-types": "rm -f tsconfig.tsbuildinfo && tsc", "test": "bun test" }, - "bin": "./src/sec.js", + "bin": "./dist/sec.js", "files": [ "dist" ], @@ -30,7 +30,6 @@ "cheerio-json-mapper": "^1.0.4", "commander": "^14.0.3", "compromise": "^14.15.0", - "concurrently": "^9.2.1", "csv-parse": "^6.2.1", "fast-xml-parser": "^5.7.3", "html-entities": "^2.6.0", @@ -44,7 +43,8 @@ "@types/bun": "1.3.13", "@types/pg": "^8.15.2", "@types/xml2js": "^0.4.14", - "bunset": "1.0.12" + "bunset": "1.0.12", + "concurrently": "^9.2.1" }, "trustedDependencies": [ "@huggingface/transformers", diff --git a/src/cli/GlobalOptions.test.ts b/src/cli/GlobalOptions.test.ts index 59242fa..0ee985b 100644 --- a/src/cli/GlobalOptions.test.ts +++ b/src/cli/GlobalOptions.test.ts @@ -18,7 +18,6 @@ describe("GlobalOptions", () => { expect(help).toContain("--verbose"); expect(help).toContain("--dry-run"); expect(help).toContain("--no-color"); - expect(help).toContain("--concurrency "); }); it("returns the program for chaining", () => { @@ -37,7 +36,6 @@ describe("GlobalOptions", () => { expect(opts.verbose).toBe(false); expect(opts.dryRun).toBe(false); expect(opts.color).toBe(true); - expect(opts.concurrency).toBeUndefined(); }); }); @@ -66,15 +64,9 @@ describe("GlobalOptions", () => { expect(parseGlobalOptions(program).color).toBe(false); }); - it("parses --concurrency with a number", () => { - const program = createProgram(); - program.parse(["--concurrency", "4"], { from: "user" }); - expect(parseGlobalOptions(program).concurrency).toBe(4); - }); - it("parses all flags together", () => { const program = createProgram(); - program.parse(["--json", "--verbose", "--dry-run", "--no-color", "--concurrency", "8"], { + program.parse(["--json", "--verbose", "--dry-run", "--no-color"], { from: "user", }); const opts = parseGlobalOptions(program); @@ -82,12 +74,6 @@ describe("GlobalOptions", () => { expect(opts.verbose).toBe(true); expect(opts.dryRun).toBe(true); expect(opts.color).toBe(false); - expect(opts.concurrency).toBe(8); - }); - - it("throws on invalid concurrency value", () => { - const program = createProgram(); - expect(() => program.parse(["--concurrency", "abc"], { from: "user" })).toThrow(); }); }); }); diff --git a/src/cli/GlobalOptions.ts b/src/cli/GlobalOptions.ts index 08c9264..d728e4f 100644 --- a/src/cli/GlobalOptions.ts +++ b/src/cli/GlobalOptions.ts @@ -5,7 +5,6 @@ export interface GlobalOptions { readonly verbose: boolean; readonly dryRun: boolean; readonly color: boolean; - readonly concurrency: number | undefined; } export function applyGlobalOptions(program: Command): Command { @@ -13,8 +12,7 @@ export function applyGlobalOptions(program: Command): Command { .option("--json", "Force JSON output", false) .option("--verbose", "Show detailed logs", false) .option("--dry-run", "Show what would happen without changes", false) - .option("--no-color", "Disable colored output") - .option("--concurrency ", "Override default concurrency", parseIntOption); + .option("--no-color", "Disable colored output"); } export function parseGlobalOptions(cmd: Command): GlobalOptions { @@ -24,7 +22,6 @@ export function parseGlobalOptions(cmd: Command): GlobalOptions { verbose: opts.verbose ?? false, dryRun: opts.dryRun ?? false, color: opts.color ?? true, - concurrency: opts.concurrency, }; } diff --git a/src/cli/cli.integration.test.ts b/src/cli/cli.integration.test.ts index 00bc801..10c1732 100644 --- a/src/cli/cli.integration.test.ts +++ b/src/cli/cli.integration.test.ts @@ -39,7 +39,6 @@ describe("CLI v2 integration", () => { expect(output).toContain("--verbose"); expect(output).toContain("--dry-run"); expect(output).toContain("--no-color"); - expect(output).toContain("--concurrency"); }); it("should show version 2.0.0", async () => { diff --git a/src/cli/groups/db.ts b/src/cli/groups/db.ts index c926c97..13d5bc5 100644 --- a/src/cli/groups/db.ts +++ b/src/cli/groups/db.ts @@ -1,8 +1,9 @@ import type { Command } from "commander"; +import { resetAllDatabases } from "../../config/resetAllDatabases"; import { setupAllDatabases } from "../../config/setupAllDatabases"; -import { runCommand } from "../runCommand"; -import { getDbStatus, getDbStats } from "../queries/DbStatus"; import { renderTable } from "../output/TableRenderer"; +import { getDbStats, getDbStatus } from "../queries/DbStatus"; +import { runCommand } from "../runCommand"; export function addDbCommands(program: Command): void { const db = program.command("db").description("Database management commands"); @@ -67,6 +68,10 @@ export function addDbCommands(program: Command): void { process.exitCode = 1; return; } - console.log("not yet implemented"); + await runCommand(async () => { + await resetAllDatabases(); + await setupAllDatabases(); + console.log("Database reset complete."); + }); }); } diff --git a/src/cli/groups/fetch.ts b/src/cli/groups/fetch.ts index d012eea..74faa77 100644 --- a/src/cli/groups/fetch.ts +++ b/src/cli/groups/fetch.ts @@ -13,6 +13,14 @@ import { secDate } from "../../util/parseDate"; import { renderTable } from "../output/TableRenderer"; import { runCommand } from "../runCommand"; +function parseCikArg(value: string): number { + const parsed = Number.parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + throw new Error(`Invalid CIK "${value}": must be a positive integer`); + } + return parsed; +} + async function listAvailableFormTypesForCik(cik: number): Promise { const entityRepo = new EntityRepo(); const filings = await entityRepo.getFilings(cik); @@ -62,7 +70,7 @@ export function addFetchCommands(program: Command): void { wf.pipe( new FetchSubmissionsTask({ defaults: { - cik: parseInt(cik), + cik: parseCikArg(cik), date: options.date ? secDate(options.date) : undefined, }, }), @@ -82,7 +90,7 @@ export function addFetchCommands(program: Command): void { wf.pipe( new FetchCompanyFactsTask({ defaults: { - cik: parseInt(cik), + cik: parseCikArg(cik), date: options.date ? secDate(options.date) : undefined, }, }), @@ -99,7 +107,7 @@ export function addFetchCommands(program: Command): void { ) .action(async (cik: string, form?: string, accession?: string) => { await runCommand(async () => { - const cikNum = parseInt(cik, 10); + const cikNum = parseCikArg(cik); if (form === undefined) { await listAvailableFormTypesForCik(cikNum); return; diff --git a/src/commands/index.ts b/src/commands/index.ts index 722b6ef..4ce8050 100644 --- a/src/commands/index.ts +++ b/src/commands/index.ts @@ -5,7 +5,7 @@ */ import type { Command } from "commander"; -import { getTaskQueueRegistry, globalServiceRegistry } from "workglow"; +import { getTaskQueueRegistry, globalServiceRegistry, Sqlite } from "workglow"; import { parseGlobalOptions } from "../cli/GlobalOptions"; import { addBootstrapCommands } from "../cli/groups/bootstrap"; import { addDbCommands } from "../cli/groups/db"; @@ -28,6 +28,13 @@ export const AddCommands = (program: Command): void => { if (diInitialized) return; diInitialized = true; + // Load the SQLite native binding only for commands that may open the DB, + // not for `init`, `--help`, `--version`, or any pure-CLI invocation. + const secDbType = process.env.SEC_DB_TYPE ?? "sqlite"; + if (secDbType === "sqlite" && typeof Sqlite.init === "function") { + await Sqlite.init(); + } + const globalOpts = parseGlobalOptions(program); globalServiceRegistry.registerInstance(SEC_DRY_RUN, globalOpts.dryRun); diff --git a/src/config/Constants.ts b/src/config/Constants.ts index f27ff1c..e618416 100644 --- a/src/config/Constants.ts +++ b/src/config/Constants.ts @@ -4,5 +4,13 @@ * SPDX-License-Identifier: Apache-2.0 */ -export const SecUserAgent = "PodleyAI SEC Job Queue "; +/** + * SEC fair-access policy requires a User-Agent in the form + * "Sample Company Name AdminContact@samplecompany.com" + * EDGAR has been observed to 403 on RFC-5322 angle-bracket forms. + * Override at runtime via the SEC_USER_AGENT environment variable so each + * deployer identifies themselves rather than masquerading as the default. + */ +const DEFAULT_SEC_USER_AGENT = "PodleyAI SEC Job Queue sroussey@gmail.com"; +export const SecUserAgent = process.env.SEC_USER_AGENT?.trim() || DEFAULT_SEC_USER_AGENT; export const SecJobQueueName = "sec_job_queue"; diff --git a/src/config/DefaultDI.ts b/src/config/DefaultDI.ts index 62ea0c9..96a74b2 100644 --- a/src/config/DefaultDI.ts +++ b/src/config/DefaultDI.ts @@ -5,6 +5,11 @@ */ import { globalServiceRegistry } from "workglow"; +import { + ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN, + AddressesEntityHistoryJunctionSchema, + AddressHistoryJunctionPrimaryKeyNames, +} from "../storage/address/AddressHistorySchema"; import { Address, ADDRESS_JUNCTION_REPOSITORY_TOKEN, @@ -42,6 +47,11 @@ import { CikNamePrimaryKeyNames, CikNameSchema, } from "../storage/entity/CikNameSchema"; +import { + ENTITY_HISTORY_REPOSITORY_TOKEN, + EntityHistoryPrimaryKeyNames, + EntityHistorySchema, +} from "../storage/entity/EntityHistorySchema"; import { ENTITY_REPOSITORY_TOKEN, EntityPrimaryKeyNames, @@ -197,6 +207,15 @@ export const DefaultDI = () => { ["cik"], ]) ); + globalServiceRegistry.registerInstance( + ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN, + createStorage( + "addresses_entity_history_junction", + AddressesEntityHistoryJunctionSchema, + AddressHistoryJunctionPrimaryKeyNames, + [["cik"]] + ) + ); // ------------------------------ Persons -------------------------------- globalServiceRegistry.registerInstance( PERSON_REPOSITORY_TOKEN, @@ -332,6 +351,12 @@ export const DefaultDI = () => { ENTITY_REPOSITORY_TOKEN, createStorage("entities", EntitySchema, EntityPrimaryKeyNames, [["name"], ["sic"]]) ); + globalServiceRegistry.registerInstance( + ENTITY_HISTORY_REPOSITORY_TOKEN, + createStorage("entities_history", EntityHistorySchema, EntityHistoryPrimaryKeyNames, [ + ["valid_to"], + ]) + ); globalServiceRegistry.registerInstance( ENTITY_TICKER_REPOSITORY_TOKEN, createStorage("entity_tickers", EntityTickerSchema, EntityTickerPrimaryKeyNames, [ diff --git a/src/config/TestingDI.ts b/src/config/TestingDI.ts index 188a5ec..24da324 100644 --- a/src/config/TestingDI.ts +++ b/src/config/TestingDI.ts @@ -5,6 +5,11 @@ */ import { InMemoryTabularStorage, globalServiceRegistry } from "workglow"; +import { + ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN, + AddressesEntityHistoryJunctionSchema, + AddressHistoryJunctionPrimaryKeyNames, +} from "../storage/address/AddressHistorySchema"; import { ADDRESS_JUNCTION_REPOSITORY_TOKEN, ADDRESS_REPOSITORY_TOKEN, @@ -40,6 +45,11 @@ import { CikNamePrimaryKeyNames, CikNameSchema, } from "../storage/entity/CikNameSchema"; +import { + ENTITY_HISTORY_REPOSITORY_TOKEN, + EntityHistoryPrimaryKeyNames, + EntityHistorySchema, +} from "../storage/entity/EntityHistorySchema"; import { ENTITY_REPOSITORY_TOKEN, EntityPrimaryKeyNames, @@ -255,6 +265,14 @@ export function resetDependencyInjectionsForTesting() { ["cik"], ]) ); + globalServiceRegistry.registerInstance( + ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN, + new InMemoryTabularStorage( + AddressesEntityHistoryJunctionSchema, + AddressHistoryJunctionPrimaryKeyNames, + [["cik"]] + ) + ); // Initialize Phone repositories globalServiceRegistry.registerInstance( @@ -295,6 +313,10 @@ export function resetDependencyInjectionsForTesting() { ENTITY_REPOSITORY_TOKEN, new InMemoryTabularStorage(EntitySchema, EntityPrimaryKeyNames, [["name"], ["sic"]]) ); + globalServiceRegistry.registerInstance( + ENTITY_HISTORY_REPOSITORY_TOKEN, + new InMemoryTabularStorage(EntityHistorySchema, EntityHistoryPrimaryKeyNames, [["valid_to"]]) + ); globalServiceRegistry.registerInstance( ENTITY_TICKER_REPOSITORY_TOKEN, new InMemoryTabularStorage(EntityTickerSchema, EntityTickerPrimaryKeyNames, [ diff --git a/src/config/resetAllDatabases.ts b/src/config/resetAllDatabases.ts new file mode 100644 index 0000000..952ba4d --- /dev/null +++ b/src/config/resetAllDatabases.ts @@ -0,0 +1,107 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { globalServiceRegistry } from "workglow"; +import { ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN } from "../storage/address/AddressHistorySchema"; +import { + ADDRESS_JUNCTION_REPOSITORY_TOKEN, + ADDRESS_REPOSITORY_TOKEN, +} from "../storage/address/AddressSchema"; +import { CHANGE_LOG_REPOSITORY_TOKEN } from "../storage/change-tracking/ChangeLogSchema"; +import { + COMPANY_ADDRESS_JUNCTION_REPOSITORY_TOKEN, + COMPANY_ENTITY_JUNCTION_REPOSITORY_TOKEN, + COMPANY_PHONE_JUNCTION_REPOSITORY_TOKEN, + COMPANY_PREVIOUS_NAMES_REPOSITORY_TOKEN, + COMPANY_REPOSITORY_TOKEN, +} from "../storage/company/CompanySchema"; +import { CIK_NAME_REPOSITORY_TOKEN } from "../storage/entity/CikNameSchema"; +import { ENTITY_HISTORY_REPOSITORY_TOKEN } from "../storage/entity/EntityHistorySchema"; +import { ENTITY_REPOSITORY_TOKEN } from "../storage/entity/EntitySchema"; +import { ENTITY_TICKER_REPOSITORY_TOKEN } from "../storage/entity/EntityTickerSchema"; +import { SIC_CODE_REPOSITORY_TOKEN } from "../storage/entity/SicCodeSchema"; +import { COMPANY_FACTS_REPOSITORY_TOKEN } from "../storage/facts/CompanyFactsSchema"; +import { FILING_REPOSITORY_TOKEN } from "../storage/filing/FilingSchema"; +import { INVESTMENT_OFFERING_HISTORY_REPOSITORY_TOKEN } from "../storage/investment-offering/InvestmentOfferingHistorySchema"; +import { INVESTMENT_OFFERING_REPOSITORY_TOKEN } from "../storage/investment-offering/InvestmentOfferingSchema"; +import { ISSUER_REPOSITORY_TOKEN } from "../storage/investment-offering/IssuerSchema"; +import { + PERSON_ADDRESS_JUNCTION_REPOSITORY_TOKEN, + PERSON_ENTITY_JUNCTION_REPOSITORY_TOKEN, + PERSON_PHONE_JUNCTION_REPOSITORY_TOKEN, + PERSON_PREVIOUS_NAMES_REPOSITORY_TOKEN, + PERSON_REPOSITORY_TOKEN, +} from "../storage/person/PersonSchema"; +import { + PHONE_ENTITY_JUNCTION_REPOSITORY_TOKEN, + PHONE_REPOSITORY_TOKEN, +} from "../storage/phone/PhoneSchema"; +import { CROWDFUNDING_HISTORY_REPOSITORY_TOKEN } from "../storage/portal/CrowdfundingHistorySchema"; +import { + CROWDFUNDING_OFFERINGS_REPOSITORY_TOKEN, + CROWDFUNDING_REPORTS_REPOSITORY_TOKEN, + CROWDFUNDING_REPOSITORY_TOKEN, +} from "../storage/portal/CrowdfundingSchema"; +import { PORTAL_REPOSITORY_TOKEN } from "../storage/portal/PortalSchema"; +import { CIK_LAST_UPDATE_REPOSITORY_TOKEN } from "../storage/processing/CikLastUpdateSchema"; +import { PROCESSED_FACTS_REPOSITORY_TOKEN } from "../storage/processing/ProcessedFactsSchema"; +import { PROCESSED_FILINGS_REPOSITORY_TOKEN } from "../storage/processing/ProcessedFilingsSchema"; +import { PROCESSED_SUBMISSIONS_REPOSITORY_TOKEN } from "../storage/processing/ProcessedSubmissionsSchema"; +import { REGA_EQUITY_CLASS_REPOSITORY_TOKEN } from "../storage/reg-a/RegAEquityClassSchema"; +import { REGA_FINANCIAL_DATA_REPOSITORY_TOKEN } from "../storage/reg-a/RegAFinancialDataSchema"; +import { REGA_OFFERING_HISTORY_REPOSITORY_TOKEN } from "../storage/reg-a/RegAOfferingHistorySchema"; +import { REGA_OFFERING_REPOSITORY_TOKEN } from "../storage/reg-a/RegAOfferingSchema"; +import { REGA_SERVICE_PROVIDER_REPOSITORY_TOKEN } from "../storage/reg-a/RegAServiceProviderSchema"; + +/** + * Truncates every registered repository. Used by `sec db reset --confirm` + * before re-running `setupAllDatabases()` to recreate the schema. + * + * NOTE: When adding a new repository token in DefaultDI.ts, add its + * deleteAll() call here so reset doesn't leave orphan rows behind. + */ +export async function resetAllDatabases(): Promise { + await globalServiceRegistry.get(ADDRESS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(ADDRESS_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PERSON_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PERSON_ENTITY_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PERSON_ADDRESS_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PERSON_PHONE_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PERSON_PREVIOUS_NAMES_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(COMPANY_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(COMPANY_ENTITY_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(COMPANY_ADDRESS_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(COMPANY_PHONE_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(COMPANY_PREVIOUS_NAMES_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PHONE_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PHONE_ENTITY_JUNCTION_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(INVESTMENT_OFFERING_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(INVESTMENT_OFFERING_HISTORY_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(ISSUER_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(ENTITY_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(ENTITY_HISTORY_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(ENTITY_TICKER_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(SIC_CODE_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(FILING_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CROWDFUNDING_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CROWDFUNDING_OFFERINGS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CROWDFUNDING_REPORTS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CROWDFUNDING_HISTORY_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CHANGE_LOG_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PORTAL_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(REGA_OFFERING_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(REGA_OFFERING_HISTORY_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(REGA_SERVICE_PROVIDER_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(REGA_FINANCIAL_DATA_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(REGA_EQUITY_CLASS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(CIK_LAST_UPDATE_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PROCESSED_FACTS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PROCESSED_SUBMISSIONS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(PROCESSED_FILINGS_REPOSITORY_TOKEN).deleteAll(); + await globalServiceRegistry.get(COMPANY_FACTS_REPOSITORY_TOKEN).deleteAll(); +} diff --git a/src/config/setupAllDatabases.ts b/src/config/setupAllDatabases.ts index cb5f96d..b01c474 100644 --- a/src/config/setupAllDatabases.ts +++ b/src/config/setupAllDatabases.ts @@ -5,6 +5,7 @@ */ import { globalServiceRegistry } from "workglow"; +import { ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN } from "../storage/address/AddressHistorySchema"; import { ADDRESS_JUNCTION_REPOSITORY_TOKEN, ADDRESS_REPOSITORY_TOKEN, @@ -18,6 +19,7 @@ import { COMPANY_REPOSITORY_TOKEN, } from "../storage/company/CompanySchema"; import { CIK_NAME_REPOSITORY_TOKEN } from "../storage/entity/CikNameSchema"; +import { ENTITY_HISTORY_REPOSITORY_TOKEN } from "../storage/entity/EntityHistorySchema"; import { ENTITY_REPOSITORY_TOKEN } from "../storage/entity/EntitySchema"; import { ENTITY_TICKER_REPOSITORY_TOKEN } from "../storage/entity/EntityTickerSchema"; import { SIC_CODE_REPOSITORY_TOKEN } from "../storage/entity/SicCodeSchema"; @@ -64,6 +66,7 @@ import { REGA_SERVICE_PROVIDER_REPOSITORY_TOKEN } from "../storage/reg-a/RegASer export async function setupAllDatabases(): Promise { await globalServiceRegistry.get(ADDRESS_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(ADDRESS_JUNCTION_REPOSITORY_TOKEN).setupDatabase(); + await globalServiceRegistry.get(ADDRESS_HISTORY_JUNCTION_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(PERSON_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(PERSON_ENTITY_JUNCTION_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(PERSON_ADDRESS_JUNCTION_REPOSITORY_TOKEN).setupDatabase(); @@ -80,6 +83,7 @@ export async function setupAllDatabases(): Promise { await globalServiceRegistry.get(INVESTMENT_OFFERING_HISTORY_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(ISSUER_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(ENTITY_REPOSITORY_TOKEN).setupDatabase(); + await globalServiceRegistry.get(ENTITY_HISTORY_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(ENTITY_TICKER_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(SIC_CODE_REPOSITORY_TOKEN).setupDatabase(); await globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN).setupDatabase(); diff --git a/src/fetch/SecCachedFetchTask.ts b/src/fetch/SecCachedFetchTask.ts index 16b6620..32e3277 100644 --- a/src/fetch/SecCachedFetchTask.ts +++ b/src/fetch/SecCachedFetchTask.ts @@ -89,22 +89,23 @@ export abstract class SecCachedFetchTask< constructor(input: I, config: Partial = {}) { super(input as I & FetchUrlTaskInput, config); + + // response_type drives both Workglow's fetch handling and the file-cache + // serializer. We resolve it once here, after super() has populated + // `defaults`/`runInputData`, so that downstream `execute()` calls and + // any cache lookups see a consistent value. const fetchInput = this.defaults as FetchUrlTaskInput & I; if (!fetchInput.response_type) { const response_type = guessResponseType(this.inputToUrl(fetchInput as I), fetchInput); fetchInput.response_type = response_type; - (this.defaults as FetchUrlTaskInput).response_type = response_type; (this.runInputData as FetchUrlTaskInput).response_type = response_type; } if (globalServiceRegistry.has(SEC_RAW_DATA_FOLDER)) { const globalPath = globalServiceRegistry.get(SEC_RAW_DATA_FOLDER); - let folderPath; - if (globalPath.startsWith("/")) { - folderPath = path.join(globalPath); - } else { - folderPath = path.join(process.cwd(), globalPath); - } + const folderPath = globalPath.startsWith("/") + ? path.join(globalPath) + : path.join(process.cwd(), globalPath); this.runConfig.outputCache = new SecFetchFileOutputCache({ folderPath: folderPath, inputToFileName: this.inputToFileName.bind(this), diff --git a/src/fetch/SecFetchFileOutputCache.ts b/src/fetch/SecFetchFileOutputCache.ts index a6ff205..42ca1bc 100644 --- a/src/fetch/SecFetchFileOutputCache.ts +++ b/src/fetch/SecFetchFileOutputCache.ts @@ -5,12 +5,16 @@ */ import { mkdirSync } from "node:fs"; -import { mkdir, readFile, stat, writeFile } from "node:fs/promises"; +import { mkdir, readFile, rename, stat, unlink, writeFile } from "node:fs/promises"; import path from "node:path"; import { FetchUrlTaskOutput, TaskInput, TaskOutput, TaskOutputRepository } from "workglow"; import { isDryRun } from "../cli/isDryRun"; import { secDate, YYYYdMMdDD } from "../util/parseDate"; +function isNodeError(error: unknown): error is NodeJS.ErrnoException { + return error instanceof Error && "code" in error; +} + interface SecFetchFileOutputCacheOptions { folderPath: string; outputCompression?: boolean; @@ -37,6 +41,7 @@ export class SecFetchFileOutputCache extends TaskOutputRepository { } else if (response_type === "text") { return output.text; } else if (response_type === "blob") { + // writeFile cannot consume a Blob directly; convert to a Buffer. return output.blob; } else { console.warn(`Unknown response type: ${response_type}, assuming text`); @@ -53,7 +58,9 @@ export class SecFetchFileOutputCache extends TaskOutputRepository { result.text = data.toString(); } if (response_type === "blob") { - result.blob = data as Blob; + // readFile returns a Buffer; wrap it back into a Blob so downstream + // consumers see the same shape they wrote. + result.blob = data instanceof Blob ? data : new Blob([data]); } return result; } @@ -68,11 +75,30 @@ export class SecFetchFileOutputCache extends TaskOutputRepository { if (isDryRun()) { return; } + const responseType = input.response_type as string; const filePath = path.join(this.folderPath, this.inputToFileName(input)); await mkdir(path.dirname(filePath), { recursive: true }); - await writeFile(filePath, this.outputSerializer(output, input.response_type as string), { - encoding: input.response_type !== "blob" ? "utf-8" : "binary", - }); + + // Write to a unique tmp file then atomically rename so an interrupted + // write never produces a truncated cache entry, and so two concurrent + // writers cannot interleave bytes targeting the same key. + const tmpPath = `${filePath}.tmp.${process.pid}.${Date.now().toString(36)}.${Math.random() + .toString(36) + .slice(2, 10)}`; + let serialized = this.outputSerializer(output, responseType); + if (responseType === "blob" && serialized instanceof Blob) { + serialized = Buffer.from(await serialized.arrayBuffer()); + } + try { + await writeFile(tmpPath, serialized, { + encoding: responseType !== "blob" ? "utf-8" : "binary", + }); + await rename(tmpPath, filePath); + } catch (error) { + // best-effort cleanup; ignore if the tmp file was never created + await unlink(tmpPath).catch(() => undefined); + throw error; + } this.emit("output_saved", taskType); } @@ -90,7 +116,9 @@ export class SecFetchFileOutputCache extends TaskOutputRepository { try { if (inputs.date) { const stats = await stat(filePath); - // if the file was created a day before the date, return undefined + // The cache entry is fresh only if it was written on or after the + // input date; older mtimes mean SEC may have published newer data + // since the entry was cached. const fileDate = secDate(new Date(stats.mtime)); const inputDate = secDate(inputs.date); if (fileDate < inputDate) { @@ -103,7 +131,14 @@ export class SecFetchFileOutputCache extends TaskOutputRepository { this.emit("output_retrieved", taskType); return this.outputDeserializer(data, inputs.response_type as string); } - } catch (error) {} + } catch (error) { + // ENOENT is the expected "cache miss" path; surface anything else so + // permission/disk/format errors aren't silently swallowed. + if (isNodeError(error) && error.code === "ENOENT") { + return undefined; + } + throw error; + } return undefined; } diff --git a/src/fetch/SecFetchJob.ts b/src/fetch/SecFetchJob.ts index 5fbe5cc..012738a 100644 --- a/src/fetch/SecFetchJob.ts +++ b/src/fetch/SecFetchJob.ts @@ -4,9 +4,102 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { FetchUrlJob, FetchUrlTaskInput, FetchUrlTaskOutput, JobConstructorParam } from "workglow"; +import { + FetchUrlJob, + FetchUrlTaskInput, + FetchUrlTaskOutput, + IJobExecuteContext, + JobConstructorParam, +} from "workglow"; import { SecUserAgent } from "../config/Constants"; +const MAX_FETCH_ATTEMPTS = 4; +const INITIAL_BACKOFF_MS = 1_000; +const MAX_BACKOFF_MS = 30_000; +const DEFAULT_TIMEOUT_MS = Number(process.env.SEC_FETCH_TIMEOUT_MS ?? 60_000); + +interface MaybeHttpError { + status?: number; + statusCode?: number; + response?: { status?: number; headers?: Record | Headers }; + headers?: Record | Headers; + retryAfter?: number; + message?: string; +} + +function getStatus(error: MaybeHttpError): number | undefined { + return error.status ?? error.statusCode ?? error.response?.status; +} + +function isRetriableError(error: unknown): boolean { + if (!(error instanceof Error)) return false; + const status = getStatus(error as MaybeHttpError); + if (status !== undefined) { + return status === 408 || status === 429 || status >= 500; + } + // Network-level failures (ECONNRESET, ETIMEDOUT, ENOTFOUND, fetch aborts that + // weren't user-driven, etc.) all surface as plain Errors with no status. + const code = (error as NodeJS.ErrnoException).code; + if (code && /^E(CONNRESET|TIMEDOUT|PIPE|AI_AGAIN|NOTFOUND|HOSTUNREACH|NETUNREACH)$/.test(code)) { + return true; + } + return /network|timeout|fetch failed|socket hang up/i.test(error.message); +} + +function readHeader( + headers: Record | Headers | undefined, + name: string +): string | undefined { + if (!headers) return undefined; + if (typeof (headers as Headers).get === "function") { + return (headers as Headers).get(name) ?? undefined; + } + const lower = name.toLowerCase(); + for (const [key, value] of Object.entries(headers as Record)) { + if (key.toLowerCase() === lower) return value; + } + return undefined; +} + +function getRetryAfterMs(error: MaybeHttpError): number | undefined { + const fromHeader = readHeader(error.response?.headers ?? error.headers, "Retry-After"); + const raw = error.retryAfter ?? fromHeader; + if (raw === undefined) return undefined; + if (typeof raw === "number" && Number.isFinite(raw)) { + return Math.max(0, raw * 1000); + } + const numeric = Number(raw); + if (Number.isFinite(numeric)) return Math.max(0, numeric * 1000); + const dateMs = Date.parse(String(raw)); + if (Number.isFinite(dateMs)) return Math.max(0, dateMs - Date.now()); + return undefined; +} + +function backoffDelay(attempt: number): number { + const exponent = Math.min(attempt, 10); + const base = Math.min(INITIAL_BACKOFF_MS * 2 ** exponent, MAX_BACKOFF_MS); + // Jitter to avoid lockstep retries when many jobs fail at once. + return Math.floor(base * (0.5 + Math.random() * 0.5)); +} + +function combineSignals(signals: Array): AbortSignal { + const live = signals.filter((s): s is AbortSignal => Boolean(s)); + if (live.length === 0) return new AbortController().signal; + if (live.length === 1) return live[0]; + if (typeof (AbortSignal as unknown as { any?: (s: AbortSignal[]) => AbortSignal }).any === "function") { + return (AbortSignal as unknown as { any: (s: AbortSignal[]) => AbortSignal }).any(live); + } + const controller = new AbortController(); + for (const s of live) { + if (s.aborted) { + controller.abort(s.reason); + break; + } + s.addEventListener("abort", () => controller.abort(s.reason), { once: true }); + } + return controller.signal; +} + export class SecFetchJob< Input extends FetchUrlTaskInput = FetchUrlTaskInput, Output = FetchUrlTaskOutput, @@ -19,4 +112,34 @@ export class SecFetchJob< }; super({ ...config, input }); } + + async execute(input: Input, context: IJobExecuteContext): Promise { + let lastError: unknown; + for (let attempt = 0; attempt < MAX_FETCH_ATTEMPTS; attempt++) { + // Per-attempt timeout so a hung TCP connection cannot pin a queue slot + // forever; respects the caller's abort signal as well. + const timeoutSignal = + DEFAULT_TIMEOUT_MS > 0 ? AbortSignal.timeout(DEFAULT_TIMEOUT_MS) : undefined; + const signal = combineSignals([context.signal, timeoutSignal]); + + try { + return (await super.execute(input, { ...context, signal })) as Output; + } catch (error) { + lastError = error; + if (context.signal.aborted) throw error; + if (!isRetriableError(error) || attempt === MAX_FETCH_ATTEMPTS - 1) throw error; + const retryAfter = getRetryAfterMs(error as MaybeHttpError); + const delay = retryAfter ?? backoffDelay(attempt); + await new Promise((resolve, reject) => { + const timer = setTimeout(resolve, delay); + const onAbort = () => { + clearTimeout(timer); + reject(context.signal.reason ?? new Error("aborted")); + }; + context.signal.addEventListener("abort", onAbort, { once: true }); + }); + } + } + throw lastError; + } } diff --git a/src/sec.ts b/src/sec.ts index 1424e0e..80ab308 100755 --- a/src/sec.ts +++ b/src/sec.ts @@ -1,10 +1,12 @@ #!/usr/bin/env bun import { program } from "commander"; -import { getTaskQueueRegistry, Sqlite } from "workglow"; +import { getTaskQueueRegistry } from "workglow"; import { applyGlobalOptions } from "./cli/GlobalOptions"; import { AddCommands } from "./commands"; import { SecCliConfigurationError } from "./config/EnvToDI"; +import { closeDb } from "./util/db"; +import { closePgPool } from "./util/pg"; program .version("2.0.0") @@ -13,11 +15,6 @@ program applyGlobalOptions(program); AddCommands(program); -const secDbType = process.env.SEC_DB_TYPE ?? "sqlite"; -if (secDbType === "sqlite" && typeof Sqlite.init === "function") { - await Sqlite.init(); -} - try { await program.parseAsync(process.argv); } catch (e) { @@ -26,6 +23,8 @@ try { process.exit(1); } throw e; +} finally { + await getTaskQueueRegistry().stopQueues(); + closeDb(); + await closePgPool(); } - -await getTaskQueueRegistry().stopQueues(); diff --git a/src/sec/forms/Form.ts b/src/sec/forms/Form.ts index 06b174a..b2235b1 100644 --- a/src/sec/forms/Form.ts +++ b/src/sec/forms/Form.ts @@ -18,12 +18,16 @@ export abstract class Form { throw new Error(`Parsing not implemented for ${form}`); } - private static _arrayPaths = new Map(); + // Keyed by the subclass constructor itself rather than `this.name`, since + // multiple Form subclasses set `name` to a human-readable description and + // collisions there would silently return another form's array paths. + private static _arrayPaths = new WeakMap(); protected static getParser(schema: TObject): XMLParser { - if (!this._arrayPaths.has(this.name)) { - this._arrayPaths.set(this.name, extractArrayPaths(schema)); + let paths = this._arrayPaths.get(this); + if (!paths) { + paths = extractArrayPaths(schema); + this._arrayPaths.set(this, paths); } - const paths = this._arrayPaths.get(this.name)!; const options: Partial = { ignoreAttributes: true, diff --git a/src/sec/forms/exempt-offerings/Form_1_A.storage.ts b/src/sec/forms/exempt-offerings/Form_1_A.storage.ts index 7d53a16..10eb4d4 100644 --- a/src/sec/forms/exempt-offerings/Form_1_A.storage.ts +++ b/src/sec/forms/exempt-offerings/Form_1_A.storage.ts @@ -5,10 +5,23 @@ */ import { AddressRepo } from "../../../storage/address/AddressRepo"; +import { US_STATE_CODE_ARRAY } from "../../../storage/address/AddressSchemaCodes"; import { CompanyRepo } from "../../../storage/company/CompanyRepo"; import { hasCompanyEnding } from "../../../storage/company/CompanyNormalization"; import { PersonRepo } from "../../../storage/person/PersonRepo"; import { PhoneRepo } from "../../../storage/phone/PhoneRepo"; + +const US_STATE_CODE_SET = new Set(US_STATE_CODE_ARRAY.map(([code]) => code)); + +/** + * EDGAR's `stateOrCountry` field stores either a 2-char US state code (e.g. + * "NY") or a 2-char ISO country code (e.g. "GB"). Both are 2 characters wide, + * so the country can only be inferred from set membership, not from length. + */ +function resolveCountryCode(stateOrCountry: string | undefined | null): string | undefined { + if (!stateOrCountry) return undefined; + return US_STATE_CODE_SET.has(stateOrCountry) ? "US" : stateOrCountry; +} import { RegAOfferingRepo } from "../../../storage/reg-a/RegAOfferingRepo"; import type { RegAOffering } from "../../../storage/reg-a/RegAOfferingSchema"; import type { RegAOfferingHistory } from "../../../storage/reg-a/RegAOfferingHistorySchema"; @@ -63,7 +76,7 @@ async function processIssuer(cik: number, form1A: Form1A): Promise { // Issuer phone const phone = await phoneRepo.savePhone({ phone_raw: issuerInfo.phoneNumber, - country_code: issuerInfo.stateOrCountry?.length === 2 ? "US" : issuerInfo.stateOrCountry, + country_code: resolveCountryCode(issuerInfo.stateOrCountry), }); await phoneRepo.saveRelatedEntity(phone.international_number, RELATION_TYPE_REGA_ISSUER, cik); await companyRepo.saveRelatedPhone( diff --git a/src/sec/forms/exempt-offerings/Form_C.storage.ts b/src/sec/forms/exempt-offerings/Form_C.storage.ts index 53da241..5216cf1 100644 --- a/src/sec/forms/exempt-offerings/Form_C.storage.ts +++ b/src/sec/forms/exempt-offerings/Form_C.storage.ts @@ -11,36 +11,22 @@ import { PersonRepo } from "../../../storage/person/PersonRepo"; import { CrowdfundingRepo } from "../../../storage/portal/CrowdfundingRepo"; import { CrowdfundingTemporalRepo } from "../../../storage/portal/CrowdfundingTemporalRepo"; import type { Crowdfunding, CrowdfundingOfferings, CrowdfundingReports } from "../../../storage/portal/CrowdfundingSchema"; +import { isBadPersonField } from "../../../types/edgar/bad-data"; import type { FormC } from "./Form_C.schema"; const RELATION_TYPE_ISSUER = "form-c:issuer"; const RELATION_TYPE_CO_ISSUER = "form-c:co-issuer"; const RELATION_TYPE_SIGNATURE = "form-c:signature"; -function isBadPersonField(field: string | undefined): boolean { - const baddies = [ - "n/a", - "na", - "none", - "(none)", - "[none]", - "-", - "--", - "---", - "----", - ".", - "..", - "...", - "....", - "_", - "__", - "___", - "____", - ]; - if (!field) return true; - const fieldLower = String(field).toLowerCase().trim(); - if (baddies.includes(fieldLower)) return true; - return false; +/** + * Coerce a CIK-shaped string into a non-negative integer, or 0 if the input is + * missing/non-numeric. EDGAR occasionally emits non-digit cruft (whitespace, + * stray punctuation) that `parseInt` would silently turn into `NaN`. + */ +function parseCikSafely(raw: string | undefined | null): number { + if (!raw) return 0; + const parsed = Number.parseInt(String(raw).trim(), 10); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : 0; } /** @@ -307,7 +293,7 @@ export async function processFormC({ state_jurisdiction: issuer.legalStatus?.jurisdictionOrganization ?? "", date_incorporation: issuer.legalStatus?.dateIncorporation ?? "", url: issuer.issuerWebsite ?? "", - portal_cik: issuerInfo.commissionCik ? parseInt(issuerInfo.commissionCik) : 0, + portal_cik: parseCikSafely(issuerInfo.commissionCik), status: determineStatus(submissionType), }; diff --git a/src/sec/forms/exempt-offerings/Form_D.storage.ts b/src/sec/forms/exempt-offerings/Form_D.storage.ts index 489c46d..9e8b865 100644 --- a/src/sec/forms/exempt-offerings/Form_D.storage.ts +++ b/src/sec/forms/exempt-offerings/Form_D.storage.ts @@ -12,6 +12,7 @@ import { PhoneRepo } from "../../../storage/phone/PhoneRepo"; import { hasCompanyEnding } from "../../../storage/company/CompanyNormalization"; import { IssuerRepo } from "../../../storage/investment-offering/IssuerRepo"; +import { isBadPersonField } from "../../../types/edgar/bad-data"; import { FormD, INDEFINITE, @@ -24,6 +25,23 @@ import { import { InvestmentOffering } from "../../../storage/investment-offering/InvestmentOfferingSchema"; import { InvestmentOfferingHistory } from "../../../storage/investment-offering/InvestmentOfferingHistorySchema"; +/** + * Coerce a numeric-shaped string into a finite integer or null. Used for + * EDGAR-emitted strings that may carry stray whitespace or non-digit cruft; + * `parseInt` would silently turn those into `NaN` and store junk. + */ +function parseIntegerOrNull(raw: string | undefined | null): number | null { + if (raw === undefined || raw === null || raw === "") return null; + const parsed = Number.parseInt(String(raw).trim(), 10); + return Number.isFinite(parsed) ? parsed : null; +} + +function parseCikSafely(raw: string | number | undefined | null): number { + if (raw === undefined || raw === null) return 0; + const parsed = Number.parseInt(String(raw).trim(), 10); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : 0; +} + // relation types for form-d const RELATION_TYPE_ISSUER = "form-d:issuer"; const RELATION_TYPE_RELATED_PERSON = "form-d:related-person"; @@ -73,10 +91,12 @@ async function processOffering( accession_number, minimum_investment_accepted: offering.minimumInvestmentAccepted, total_offering_amount: - amounts.totalOfferingAmount === INDEFINITE ? null : parseInt(amounts.totalOfferingAmount), + amounts.totalOfferingAmount === INDEFINITE + ? null + : parseIntegerOrNull(amounts.totalOfferingAmount), total_amount_sold: amounts.totalAmountSold, total_remaining: - amounts.totalRemaining === INDEFINITE ? null : parseInt(amounts.totalRemaining), + amounts.totalRemaining === INDEFINITE ? null : parseIntegerOrNull(amounts.totalRemaining), investor_count: offering.investors.totalNumberAlreadyInvested, non_accredited_count: offering.investors.numberNonAccreditedInvestors || null, }; @@ -172,38 +192,6 @@ async function processSalesCompensationRecipient(cik: number, recipient: any): P } } -function isBadPersonField(field: string | undefined): boolean { - const baddies = [ - "n/a", - "na", - "Managing Member", - "(entity)", - "[entity]", - "entity", - "none", - "(none)", - "[none]", - "-", - "--", - "---", - "----", - ".", - "..", - "...", - "....", - "_", - "__", - "___", - "____", - "a Delaware limited liability company", - ]; - if (!field) return true; - const fieldLower = String(field).toLowerCase().trim(); - if (baddies.includes(fieldLower)) return true; - - return false; -} - async function processIssuer(cik: number, issuer: Issuer, isPrimaryIssuer: boolean): Promise { // Repository instances const companyRepo = new CompanyRepo(); @@ -223,7 +211,7 @@ async function processIssuer(cik: number, issuer: Issuer, isPrimaryIssuer: boole // Save the issuer record only if it's not a self-reference // (primary issuer typically has the same CIK as the filing entity) - const issuerCik = parseInt(issuer.cik.toString()); + const issuerCik = parseCikSafely(issuer.cik); if (issuerCik !== cik) { const issuerRecord = { cik, diff --git a/src/storage/investment-offering/InvestmentOfferingHistorySchema.ts b/src/storage/investment-offering/InvestmentOfferingHistorySchema.ts index 3eb8d98..2aafed3 100644 --- a/src/storage/investment-offering/InvestmentOfferingHistorySchema.ts +++ b/src/storage/investment-offering/InvestmentOfferingHistorySchema.ts @@ -20,7 +20,7 @@ export const InvestmentOfferingHistorySchema = Type.Object({ description: "SEC file number for the offering", }), accession_number: Type.String({ - maxLength: 10, + maxLength: 20, description: "SEC accession number for the specific filing", }), minimum_investment_accepted: TypeNullable( diff --git a/src/task/facts/FetchCompanyFactsTask.ts b/src/task/facts/FetchCompanyFactsTask.ts index 9f7ad91..c486730 100644 --- a/src/task/facts/FetchCompanyFactsTask.ts +++ b/src/task/facts/FetchCompanyFactsTask.ts @@ -74,7 +74,9 @@ export class FetchCompanyFactsTask extends Task< context: IExecuteContext ): Promise { const cik = input.cik; - if (!cik) return { facts: [], cik: 0 }; + if (!cik) { + return { facts: [], cik: 0, date: input.date ? secDate(input.date) : undefined }; + } this._secFetch ??= context.own(new SecFetchCompanyFactsTask(input)); this._secFetch.setDefaults(input); diff --git a/src/task/facts/StoreCompanyFactsTask.ts b/src/task/facts/StoreCompanyFactsTask.ts index f88f921..6cdd432 100644 --- a/src/task/facts/StoreCompanyFactsTask.ts +++ b/src/task/facts/StoreCompanyFactsTask.ts @@ -5,13 +5,7 @@ */ import { Type } from "typebox"; -import { - globalServiceRegistry, - IExecuteContext, - Task, - TaskAbortedError, - TaskError, -} from "workglow"; +import { globalServiceRegistry, IExecuteContext, Task, TaskAbortedError } from "workglow"; import { Factoid } from "../../sec/facts/CompanyFacts"; import { COMPANY_FACTS_REPOSITORY_TOKEN } from "../../storage/facts/CompanyFactsSchema"; import { PROCESSED_FACTS_REPOSITORY_TOKEN } from "../../storage/processing/ProcessedFactsSchema"; @@ -49,11 +43,21 @@ export class StoreCompanyFactsTask extends Task< context: IExecuteContext ): Promise { const factsArray: Factoid[] = input.facts.filter((f) => !!f); - if (!factsArray) throw new TaskError("No facts data to store"); const companyFactsRepo = globalServiceRegistry.get(COMPANY_FACTS_REPOSITORY_TOKEN); const processedFactsRepo = globalServiceRegistry.get(PROCESSED_FACTS_REPOSITORY_TOKEN); + if (factsArray.length === 0) { + if (input.date) { + await processedFactsRepo.put({ + cik: input.cik, + last_processed: input.date, + success: true, + }); + } + return { success: true }; + } + let progress = 0; const batchSize = 1000; const batches = Math.ceil(factsArray.length / batchSize); diff --git a/src/task/facts/fetchAndStoreCompanyFacts.ts b/src/task/facts/fetchAndStoreCompanyFacts.ts index 53f7a25..25c1403 100644 --- a/src/task/facts/fetchAndStoreCompanyFacts.ts +++ b/src/task/facts/fetchAndStoreCompanyFacts.ts @@ -20,7 +20,8 @@ export async function fetchAndStoreCompanyFacts( await pipeline.run(input); success = true; } catch (e) { - // success remains false + const message = e instanceof Error ? e.message : String(e); + console.warn(`Failed to fetch/store company facts for CIK ${input.cik}: ${message}`); } finally { const processedFactsRepo = globalServiceRegistry.get(PROCESSED_FACTS_REPOSITORY_TOKEN); await processedFactsRepo.put({ diff --git a/src/task/index/FetchQuarterlyIndexRangeTask.ts b/src/task/index/FetchQuarterlyIndexRangeTask.ts index 50f91e4..a0d0399 100644 --- a/src/task/index/FetchQuarterlyIndexRangeTask.ts +++ b/src/task/index/FetchQuarterlyIndexRangeTask.ts @@ -85,16 +85,16 @@ export class FetchQuarterlyIndexRangeTask extends Task< const endYear = input.endYear ?? todayYear; const endQuarter = input.endQuarter ?? Math.ceil(todayMonth / 3); - // from the date to the current date, fetch the quarterly index - const quarters = (endYear - startYear) * 4 + (endQuarter - startQuarter); + // Walk every quarter inclusive of both endpoints, anchored on the actual + // startQuarter. The previous form both undercounted by one and ignored + // startQuarter, so a range like (2022 Q2 .. 2022 Q4) would emit Q1 dates. const dates: string[] = []; - for (let i = 0; i < quarters; i++) { + const startIndex = startQuarter - 1; // 0..3 + const endIndex = (endYear - startYear) * 4 + (endQuarter - 1); + for (let i = startIndex; i <= endIndex; i++) { const fetchYear = startYear + Math.floor(i / 4); const fetchMonth = (i % 4) * 3 + 1; - const fetchDay = 1; - dates.push( - `${fetchYear}-${fetchMonth.toString().padStart(2, "0")}-${fetchDay.toString().padStart(2, "0")}` - ); + dates.push(`${fetchYear}-${fetchMonth.toString().padStart(2, "0")}-01`); } if (dates.length === 0) return { updateList: [] }; diff --git a/src/task/submissions/fetchAndStoreSubmission.ts b/src/task/submissions/fetchAndStoreSubmission.ts index d1be856..3025a84 100644 --- a/src/task/submissions/fetchAndStoreSubmission.ts +++ b/src/task/submissions/fetchAndStoreSubmission.ts @@ -16,6 +16,8 @@ export async function fetchAndStoreSubmission( try { await pipeline.run(input); } catch (e) { + const message = e instanceof Error ? e.message : String(e); + console.warn(`Failed to fetch/store submission for CIK ${input.cik}: ${message}`); await processUpdateProcessing(input.cik, false); } // Per-item failures are recorded above; the map task itself always succeeds diff --git a/src/types/edgar/bad-data.ts b/src/types/edgar/bad-data.ts index 9661301..8fd2eda 100644 --- a/src/types/edgar/bad-data.ts +++ b/src/types/edgar/bad-data.ts @@ -1,54 +1,120 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + export type TheNadas = - | '' - | 'N/A' - | '(N/A)' - | 'N//A' - | 'N?A' - | 'N.A' - | 'N.A.' - | 'N/AQ' - | 'NONE' - | '(NONE)' - | '[NONE]' - | '-' - | '--' - | '---' - | '----' - | '-----' - | '------' + | "" + | "N/A" + | "(N/A)" + | "N//A" + | "N?A" + | "N.A" + | "N.A." + | "N/AQ" + | "NONE" + | "(NONE)" + | "[NONE]" + | "-" + | "--" + | "---" + | "----" + | "-----" + | "------" | '"-"' - | '.' - | '..' - | '...'; + | "." + | ".." + | "..."; export type TheConfused = - | ',' + | "," | '"' | "'" - | 'ETC' - | 'SAME' + | "ETC" + | "SAME" | '("ETC")' - | '(SAME)' - | '[SAME]' - | 'See Comments'; + | "(SAME)" + | "[SAME]" + | "See Comments"; export type TheEntities = - | 'CORP' - | 'CORP.' - | 'INC' - | 'INC.' - | 'LLC' - | 'L.L.C.' - | 'LTD' - | 'LTD.' - | 'PTE LTD' - | 'GP' - | 'LP' - | '(GP)' - | '(LP)' + | "CORP" + | "CORP." + | "INC" + | "INC." + | "LLC" + | "L.L.C." + | "LTD" + | "LTD." + | "PTE LTD" + | "GP" + | "LP" + | "(GP)" + | "(LP)" | '("GP")' | '("LP")' - | 'Sponsor' - | '(Sponsor)'; + | "Sponsor" + | "(Sponsor)"; export type BadName = TheNadas | TheConfused | TheEntities; + +/** + * Lowercased, trimmed forms of values EDGAR filers commonly enter when they + * have nothing real to put in a person/issuer name field. Each form parser + * used to keep its own subtly different copy of this list; centralizing here + * keeps the behaviour consistent across forms and easier to audit. + */ +const BAD_PERSON_FIELDS = new Set([ + "", + "n/a", + "(n/a)", + "n//a", + "n?a", + "n.a", + "n.a.", + "n/aq", + "na", + "none", + "(none)", + "[none]", + "-", + "--", + "---", + "----", + "-----", + "------", + '"-"', + ".", + "..", + "...", + "....", + "_", + "__", + "___", + "____", + ",", + '"', + "'", + "etc", + "same", + '("etc")', + "(same)", + "[same]", + "see comments", + "managing member", + "entity", + "(entity)", + "[entity]", + "a delaware limited liability company", +]); + +/** + * Returns true when `field` is missing or matches a known placeholder/cruft + * value that should be treated as "no real data" for person/issuer name fields. + */ +export function isBadPersonField(field: string | undefined | null): boolean { + if (field === undefined || field === null) return true; + const normalized = String(field).toLowerCase().trim(); + return BAD_PERSON_FIELDS.has(normalized); +} diff --git a/src/util/db.ts b/src/util/db.ts index c06169e..0dc2777 100644 --- a/src/util/db.ts +++ b/src/util/db.ts @@ -17,11 +17,20 @@ export function getDb(): Sqlite.Database { mkdirSync(dir, { recursive: true }); const location = path.join(dir, `${globalServiceRegistry.get(SEC_DB_NAME)}.sqlite`); db = new Sqlite.Database(location); - db.exec("PRAGMA synchronous = 0"); + // WAL + synchronous=NORMAL keeps durability across crashes while still + // allowing concurrent readers; the previous OFF/0 combination meant any + // crash mid-write could leave the database irrecoverable. + db.exec("PRAGMA journal_mode = WAL"); + db.exec("PRAGMA synchronous = NORMAL"); db.exec("PRAGMA cache_size = 1000000"); - db.exec("PRAGMA locking_mode = EXCLUSIVE"); db.exec("PRAGMA temp_store = MEMORY"); - db.exec("PRAGMA journal_mode = OFF"); } return db; } + +export function closeDb(): void { + if (db) { + db.close(); + db = null; + } +} diff --git a/test_any.ts b/test_any.ts deleted file mode 100644 index 7a1de1c..0000000 --- a/test_any.ts +++ /dev/null @@ -1,5 +0,0 @@ -import { Type } from "typebox"; - -console.log("Any:", JSON.stringify(Type.Any(), null, 2)); -console.log("String:", JSON.stringify(Type.String(), null, 2)); -console.log("Nullable Any:", JSON.stringify(Type.Union([Type.Any(), Type.Null()]), null, 2)); From e5bb21e48082620c115fe13bfee938097185ff80 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 9 May 2026 22:19:47 +0000 Subject: [PATCH 2/2] fix: address Copilot review feedback on PR #98 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tighten CIK/integer parsers in Form_C, Form_D, and the CLI fetch group to require /^\d+$/ — parseInt would silently accept "123abc" as 123 and store plausible-but-wrong CIKs. Make SecFetchJob's retry/timeout knobs env-driven (SEC_FETCH_MAX_ATTEMPTS, SEC_FETCH_INITIAL_BACKOFF_MS, SEC_FETCH_MAX_BACKOFF_MS, SEC_FETCH_TIMEOUT_MS) with strict validation that falls back to defaults on missing/invalid values. Replace AbortSignal.timeout() with a cancellable AbortController + setTimeout/clearTimeout pair so timers don't leak in a high-throughput queue. Extract sleepWithAbort() helper that detaches its abort listener on resolve. Detect workglow's RetryableJobError via name/`retryable` flag (instanceof Error is unreliable across module boundaries) and honor `retryDate` for Retry-After. Add unit tests covering 429 + Retry-After retry, 5xx retry, and 404 fail-fast. Resolve Form_1_A country codes through COUNTRY_STATE_CODE_ARRAY so phone parsing receives ISO 3166-1 alpha-2 (the documented contract for PhoneSchema.country_code) rather than SEC's "B3"/"X0"-style codes. In sec.ts, replace process.exit(1) with process.exitCode + rethrow so the shutdown finally block actually runs, and wrap stopQueues/closeDb/closePgPool in Promise.allSettled so a crashing cleanup step can't mask the primary command failure. https://claude.ai/code/session_01UEfSvXJcxWC1csqry7FWmJ --- src/cli/groups/fetch.ts | 8 +- src/fetch/SecFetchJob.test.ts | 89 ++++++++++++++++ src/fetch/SecFetchJob.ts | 100 ++++++++++++++---- src/sec.ts | 25 ++++- .../exempt-offerings/Form_1_A.storage.ts | 30 +++++- .../forms/exempt-offerings/Form_C.storage.ts | 8 +- .../forms/exempt-offerings/Form_D.storage.ts | 19 ++-- 7 files changed, 239 insertions(+), 40 deletions(-) diff --git a/src/cli/groups/fetch.ts b/src/cli/groups/fetch.ts index 74faa77..0e6b02a 100644 --- a/src/cli/groups/fetch.ts +++ b/src/cli/groups/fetch.ts @@ -14,7 +14,13 @@ import { renderTable } from "../output/TableRenderer"; import { runCommand } from "../runCommand"; function parseCikArg(value: string): number { - const parsed = Number.parseInt(value, 10); + // Require an all-digit string. parseInt would silently accept "123abc" as + // 123 and produce a plausible-looking but wrong CIK. + const trimmed = value.trim(); + if (!/^\d+$/.test(trimmed)) { + throw new Error(`Invalid CIK "${value}": must be a positive integer`); + } + const parsed = Number(trimmed); if (!Number.isFinite(parsed) || parsed <= 0) { throw new Error(`Invalid CIK "${value}": must be a positive integer`); } diff --git a/src/fetch/SecFetchJob.test.ts b/src/fetch/SecFetchJob.test.ts index c735d3c..3201be6 100644 --- a/src/fetch/SecFetchJob.test.ts +++ b/src/fetch/SecFetchJob.test.ts @@ -59,4 +59,93 @@ describe("SecFetchJob", () => { server.stop(); } }); + + describe("retry behavior", () => { + it("retries on 429 honoring Retry-After and eventually succeeds", async () => { + let attempts = 0; + const server = Bun.serve({ + port: 0, + fetch() { + attempts++; + if (attempts < 3) { + return new Response("rate limited", { + status: 429, + headers: { "Retry-After": "0" }, + }); + } + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }, + }); + try { + const url = `http://127.0.0.1:${server.port}/x.json`; + const job = new SecFetchJob({ + input: { url, response_type: "json" } satisfies FetchUrlTaskInput, + }); + const out = await job.execute(job.input, { + signal: new AbortController().signal, + updateProgress: async () => {}, + }); + expect(attempts).toBe(3); + expect((out as { json?: { ok?: boolean } }).json?.ok).toBe(true); + } finally { + server.stop(); + } + }, 15_000); + + it("retries on 5xx and ultimately surfaces the error", async () => { + let attempts = 0; + const server = Bun.serve({ + port: 0, + fetch() { + attempts++; + return new Response("boom", { status: 503 }); + }, + }); + try { + const url = `http://127.0.0.1:${server.port}/x.json`; + const job = new SecFetchJob({ + input: { url, response_type: "json" } satisfies FetchUrlTaskInput, + }); + await expect( + job.execute(job.input, { + signal: new AbortController().signal, + updateProgress: async () => {}, + }) + ).rejects.toBeDefined(); + // Retried at least once before giving up. + expect(attempts).toBeGreaterThan(1); + } finally { + server.stop(); + } + }, 30_000); + + it("does not retry on 404 (non-retriable) and fails fast", async () => { + let attempts = 0; + const server = Bun.serve({ + port: 0, + fetch() { + attempts++; + return new Response("not found", { status: 404 }); + }, + }); + try { + const url = `http://127.0.0.1:${server.port}/missing.json`; + const job = new SecFetchJob({ + input: { url, response_type: "json" } satisfies FetchUrlTaskInput, + }); + await expect( + job.execute(job.input, { + signal: new AbortController().signal, + updateProgress: async () => {}, + }) + ).rejects.toBeDefined(); + expect(attempts).toBe(1); + } finally { + server.stop(); + } + }, 10_000); + }); }); diff --git a/src/fetch/SecFetchJob.ts b/src/fetch/SecFetchJob.ts index 012738a..642fafb 100644 --- a/src/fetch/SecFetchJob.ts +++ b/src/fetch/SecFetchJob.ts @@ -13,10 +13,21 @@ import { } from "workglow"; import { SecUserAgent } from "../config/Constants"; -const MAX_FETCH_ATTEMPTS = 4; -const INITIAL_BACKOFF_MS = 1_000; -const MAX_BACKOFF_MS = 30_000; -const DEFAULT_TIMEOUT_MS = Number(process.env.SEC_FETCH_TIMEOUT_MS ?? 60_000); +function readPositiveIntEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw === "") return fallback; + const trimmed = raw.trim(); + if (!/^\d+$/.test(trimmed)) return fallback; + const parsed = Number(trimmed); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; +} + +// Retry/timeout knobs are tunable via env so deployers can tighten or loosen +// behaviour without a rebuild. Invalid values fall back to the defaults. +const MAX_FETCH_ATTEMPTS = readPositiveIntEnv("SEC_FETCH_MAX_ATTEMPTS", 4); +const INITIAL_BACKOFF_MS = readPositiveIntEnv("SEC_FETCH_INITIAL_BACKOFF_MS", 1_000); +const MAX_BACKOFF_MS = readPositiveIntEnv("SEC_FETCH_MAX_BACKOFF_MS", 30_000); +const DEFAULT_TIMEOUT_MS = readPositiveIntEnv("SEC_FETCH_TIMEOUT_MS", 60_000); interface MaybeHttpError { status?: number; @@ -24,7 +35,10 @@ interface MaybeHttpError { response?: { status?: number; headers?: Record | Headers }; headers?: Record | Headers; retryAfter?: number; + retryable?: boolean; + retryDate?: Date; message?: string; + name?: string; } function getStatus(error: MaybeHttpError): number | undefined { @@ -32,18 +46,36 @@ function getStatus(error: MaybeHttpError): number | undefined { } function isRetriableError(error: unknown): boolean { - if (!(error instanceof Error)) return false; - const status = getStatus(error as MaybeHttpError); + if (error === null || typeof error !== "object") return false; + const e = error as MaybeHttpError; + + // Workglow wraps transient HTTP failures (429/5xx, network errors) as + // RetryableJobError with `retryable: true`. Trust that flag when present. + // (Don't rely on `instanceof Error` here — RetryableJobError can fail that + // check across module/realm boundaries even when the prototype chain ends + // at a real Error.) + if (e.retryable === true || e.name === "RetryableJobError") return true; + if (e.name === "PermanentJobError" || e.retryable === false) return false; + + const status = getStatus(e); if (status !== undefined) { return status === 408 || status === 429 || status >= 500; } + const message = typeof e.message === "string" ? e.message : ""; + // Status pulled out of the message as a last resort — workglow's HTTP error + // surfaces "...: " without exposing a numeric field. + const msgStatus = message.match(/:\s*(\d{3})\s/)?.[1]; + if (msgStatus) { + const code = Number(msgStatus); + if (code === 408 || code === 429 || code >= 500) return true; + } // Network-level failures (ECONNRESET, ETIMEDOUT, ENOTFOUND, fetch aborts that // weren't user-driven, etc.) all surface as plain Errors with no status. const code = (error as NodeJS.ErrnoException).code; if (code && /^E(CONNRESET|TIMEDOUT|PIPE|AI_AGAIN|NOTFOUND|HOSTUNREACH|NETUNREACH)$/.test(code)) { return true; } - return /network|timeout|fetch failed|socket hang up/i.test(error.message); + return /network|timeout|fetch failed|socket hang up/i.test(message); } function readHeader( @@ -62,6 +94,10 @@ function readHeader( } function getRetryAfterMs(error: MaybeHttpError): number | undefined { + // Workglow's RetryableJobError exposes a parsed `retryDate`; prefer that. + if (error.retryDate instanceof Date && !Number.isNaN(error.retryDate.getTime())) { + return Math.max(0, error.retryDate.getTime() - Date.now()); + } const fromHeader = readHeader(error.response?.headers ?? error.headers, "Retry-After"); const raw = error.retryAfter ?? fromHeader; if (raw === undefined) return undefined; @@ -116,11 +152,17 @@ export class SecFetchJob< async execute(input: Input, context: IJobExecuteContext): Promise { let lastError: unknown; for (let attempt = 0; attempt < MAX_FETCH_ATTEMPTS; attempt++) { - // Per-attempt timeout so a hung TCP connection cannot pin a queue slot - // forever; respects the caller's abort signal as well. - const timeoutSignal = - DEFAULT_TIMEOUT_MS > 0 ? AbortSignal.timeout(DEFAULT_TIMEOUT_MS) : undefined; - const signal = combineSignals([context.signal, timeoutSignal]); + // Per-attempt timeout. Use an AbortController + setTimeout so we can + // clearTimeout() on success: AbortSignal.timeout() leaves an + // uncancellable timer alive, which accumulates in a high-throughput + // queue. We still combine with the caller's signal so external aborts + // win. + const timeoutController = DEFAULT_TIMEOUT_MS > 0 ? new AbortController() : undefined; + const timeoutHandle = + timeoutController && DEFAULT_TIMEOUT_MS > 0 + ? setTimeout(() => timeoutController.abort(new Error("SEC fetch timed out")), DEFAULT_TIMEOUT_MS) + : undefined; + const signal = combineSignals([context.signal, timeoutController?.signal]); try { return (await super.execute(input, { ...context, signal })) as Output; @@ -130,16 +172,34 @@ export class SecFetchJob< if (!isRetriableError(error) || attempt === MAX_FETCH_ATTEMPTS - 1) throw error; const retryAfter = getRetryAfterMs(error as MaybeHttpError); const delay = retryAfter ?? backoffDelay(attempt); - await new Promise((resolve, reject) => { - const timer = setTimeout(resolve, delay); - const onAbort = () => { - clearTimeout(timer); - reject(context.signal.reason ?? new Error("aborted")); - }; - context.signal.addEventListener("abort", onAbort, { once: true }); - }); + await sleepWithAbort(delay, context.signal); + } finally { + if (timeoutHandle !== undefined) clearTimeout(timeoutHandle); } } throw lastError; } } + +/** + * Sleep for `ms` or reject if `signal` aborts. Always detaches its abort + * listener on resolve/reject so we don't leak listeners on long-lived signals. + */ +function sleepWithAbort(ms: number, signal: AbortSignal): Promise { + return new Promise((resolve, reject) => { + if (signal.aborted) { + reject(signal.reason ?? new Error("aborted")); + return; + } + const onAbort = () => { + clearTimeout(timer); + signal.removeEventListener("abort", onAbort); + reject(signal.reason ?? new Error("aborted")); + }; + const timer = setTimeout(() => { + signal.removeEventListener("abort", onAbort); + resolve(); + }, ms); + signal.addEventListener("abort", onAbort, { once: true }); + }); +} diff --git a/src/sec.ts b/src/sec.ts index 80ab308..08bfc27 100755 --- a/src/sec.ts +++ b/src/sec.ts @@ -15,16 +15,31 @@ program applyGlobalOptions(program); AddCommands(program); +let primaryError: unknown; try { await program.parseAsync(process.argv); } catch (e) { + primaryError = e; if (e instanceof SecCliConfigurationError) { console.error(e.message); - process.exit(1); + process.exitCode = 1; } - throw e; } finally { - await getTaskQueueRegistry().stopQueues(); - closeDb(); - await closePgPool(); + // Run shutdown via allSettled so a crashing cleanup step can't mask the + // primary command failure or skip later cleanup. process.exit() would + // bypass this block entirely, so we use exitCode + rethrow instead. + const cleanups = await Promise.allSettled([ + getTaskQueueRegistry().stopQueues(), + Promise.resolve().then(() => closeDb()), + closePgPool(), + ]); + for (const result of cleanups) { + if (result.status === "rejected") { + console.error("Cleanup error:", result.reason); + } + } +} + +if (primaryError !== undefined && !(primaryError instanceof SecCliConfigurationError)) { + throw primaryError; } diff --git a/src/sec/forms/exempt-offerings/Form_1_A.storage.ts b/src/sec/forms/exempt-offerings/Form_1_A.storage.ts index 10eb4d4..bd52865 100644 --- a/src/sec/forms/exempt-offerings/Form_1_A.storage.ts +++ b/src/sec/forms/exempt-offerings/Form_1_A.storage.ts @@ -5,7 +5,10 @@ */ import { AddressRepo } from "../../../storage/address/AddressRepo"; -import { US_STATE_CODE_ARRAY } from "../../../storage/address/AddressSchemaCodes"; +import { + COUNTRY_STATE_CODE_ARRAY, + US_STATE_CODE_ARRAY, +} from "../../../storage/address/AddressSchemaCodes"; import { CompanyRepo } from "../../../storage/company/CompanyRepo"; import { hasCompanyEnding } from "../../../storage/company/CompanyNormalization"; import { PersonRepo } from "../../../storage/person/PersonRepo"; @@ -13,14 +16,31 @@ import { PhoneRepo } from "../../../storage/phone/PhoneRepo"; const US_STATE_CODE_SET = new Set(US_STATE_CODE_ARRAY.map(([code]) => code)); +// SEC's stateOrCountry field uses SEC-specific 2-char codes for non-US +// countries (e.g. "B3" = Albania). PhoneSchema.country_code is documented as +// ISO 3166-1 alpha-2 and gets passed to phone parsing as a regionCode, so we +// have to translate. Map SEC code → ISO; also accept already-ISO inputs. +const SEC_CODE_TO_ISO = new Map( + COUNTRY_STATE_CODE_ARRAY.map(([iso, secCode]) => [secCode as string, iso as string]) +); +const ISO_CODE_SET = new Set(COUNTRY_STATE_CODE_ARRAY.map(([iso]) => iso as string)); + /** - * EDGAR's `stateOrCountry` field stores either a 2-char US state code (e.g. - * "NY") or a 2-char ISO country code (e.g. "GB"). Both are 2 characters wide, - * so the country can only be inferred from set membership, not from length. + * Resolve EDGAR's `stateOrCountry` field to an ISO 3166-1 alpha-2 country + * code. US state codes resolve to "US"; SEC country codes are mapped to ISO; + * inputs that are already ISO pass through. Returns undefined when nothing + * matches so PhoneRepo can fall back to its own defaults rather than + * receiving a bogus regionCode. */ function resolveCountryCode(stateOrCountry: string | undefined | null): string | undefined { if (!stateOrCountry) return undefined; - return US_STATE_CODE_SET.has(stateOrCountry) ? "US" : stateOrCountry; + const code = stateOrCountry.trim().toUpperCase(); + if (!code) return undefined; + if (US_STATE_CODE_SET.has(code)) return "US"; + const iso = SEC_CODE_TO_ISO.get(code); + if (iso) return iso; + if (ISO_CODE_SET.has(code)) return code; + return undefined; } import { RegAOfferingRepo } from "../../../storage/reg-a/RegAOfferingRepo"; import type { RegAOffering } from "../../../storage/reg-a/RegAOfferingSchema"; diff --git a/src/sec/forms/exempt-offerings/Form_C.storage.ts b/src/sec/forms/exempt-offerings/Form_C.storage.ts index 5216cf1..e0012d4 100644 --- a/src/sec/forms/exempt-offerings/Form_C.storage.ts +++ b/src/sec/forms/exempt-offerings/Form_C.storage.ts @@ -25,8 +25,12 @@ const RELATION_TYPE_SIGNATURE = "form-c:signature"; */ function parseCikSafely(raw: string | undefined | null): number { if (!raw) return 0; - const parsed = Number.parseInt(String(raw).trim(), 10); - return Number.isFinite(parsed) && parsed >= 0 ? parsed : 0; + const trimmed = String(raw).trim(); + // Require all digits — parseInt would accept "123abc" as 123, which is a + // plausible but wrong CIK. + if (!/^\d+$/.test(trimmed)) return 0; + const parsed = Number(trimmed); + return Number.isFinite(parsed) ? parsed : 0; } /** diff --git a/src/sec/forms/exempt-offerings/Form_D.storage.ts b/src/sec/forms/exempt-offerings/Form_D.storage.ts index 9e8b865..4ecce68 100644 --- a/src/sec/forms/exempt-offerings/Form_D.storage.ts +++ b/src/sec/forms/exempt-offerings/Form_D.storage.ts @@ -26,20 +26,25 @@ import { InvestmentOffering } from "../../../storage/investment-offering/Investm import { InvestmentOfferingHistory } from "../../../storage/investment-offering/InvestmentOfferingHistorySchema"; /** - * Coerce a numeric-shaped string into a finite integer or null. Used for - * EDGAR-emitted strings that may carry stray whitespace or non-digit cruft; - * `parseInt` would silently turn those into `NaN` and store junk. + * Coerce a numeric-shaped string into a finite integer or null. EDGAR-emitted + * strings can carry stray whitespace or non-digit cruft; `parseInt` would + * silently swallow trailing junk (e.g. "123abc" → 123). We require the entire + * trimmed value to be digits (with an optional leading sign) before parsing. */ -function parseIntegerOrNull(raw: string | undefined | null): number | null { +function parseIntegerOrNull(raw: string | number | undefined | null): number | null { if (raw === undefined || raw === null || raw === "") return null; - const parsed = Number.parseInt(String(raw).trim(), 10); + const trimmed = String(raw).trim(); + if (!/^-?\d+$/.test(trimmed)) return null; + const parsed = Number(trimmed); return Number.isFinite(parsed) ? parsed : null; } function parseCikSafely(raw: string | number | undefined | null): number { if (raw === undefined || raw === null) return 0; - const parsed = Number.parseInt(String(raw).trim(), 10); - return Number.isFinite(parsed) && parsed >= 0 ? parsed : 0; + const trimmed = String(raw).trim(); + if (!/^\d+$/.test(trimmed)) return 0; + const parsed = Number(trimmed); + return Number.isFinite(parsed) ? parsed : 0; } // relation types for form-d