Skip to content

Commit 720ea69

Browse files
feat: allow configurable collector port (#5607)
* feat: allow configuring collector port
* refactor
* persist collector save in prod

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
1 parent b90aa12 commit 720ea69

8 files changed

Lines changed: 58 additions & 6 deletions

File tree

collector/.env.example

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
# Placeholder .env file for collector runtime
22

3+
# Port the collector listens on. The server must use the same COLLECTOR_PORT value when running services separately.
4+
# COLLECTOR_PORT=8888
5+
36
# This enables HTTP request/response logging in development. Set value to truthy string to enable, leave empty value or comment out to disable
47
# ENABLE_HTTP_LOGGER=""
58
# This enables timestamps for the HTTP Logger. Set value to true to enable, leave empty or comment out to disable
6-
# ENABLE_HTTP_LOGGER_TIMESTAMPS=""
9+
# ENABLE_HTTP_LOGGER_TIMESTAMPS=""

collector/index.js

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ const bodyParser = require("body-parser");
88
const cors = require("cors");
99
const path = require("path");
1010
const { ACCEPTED_MIMES } = require("./utils/constants");
11-
const { reqBody } = require("./utils/http");
11+
const { reqBody, getCollectorPort } = require("./utils/http");
1212
const { processSingleFile } = require("./processSingleFile");
1313
const { processLink, getLinkText } = require("./processLink");
1414
const { wipeCollectorStorage } = require("./utils/files");
@@ -18,6 +18,7 @@ const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
1818
const { httpLogger } = require("./middleware/httpLogger");
1919
const app = express();
2020
const FILE_LIMIT = "3GB";
21+
const COLLECTOR_PORT = getCollectorPort();
2122

2223
// Only log HTTP requests in development mode and if the ENABLE_HTTP_LOGGER environment variable is set to true
2324
if (
@@ -187,9 +188,9 @@ app.all("*", function (_, response) {
187188
});
188189

189190
app
190-
.listen(8888, async () => {
191+
.listen(COLLECTOR_PORT, async () => {
191192
await wipeCollectorStorage();
192-
console.log(`Document processor app listening on port 8888`);
193+
console.log(`Document processor app listening on port ${COLLECTOR_PORT}`);
193194
})
194195
.on("error", function (_) {
195196
process.once("SIGUSR2", function () {

collector/utils/http/index.js

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
process.env.NODE_ENV === "development"
22
? require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` })
33
: require("dotenv").config();
4+
const DEFAULT_COLLECTOR_PORT = 8888;
45

56
function reqBody(request) {
67
return typeof request.body === "string"
@@ -28,8 +29,26 @@ function validBaseUrl(baseUrl) {
2829
}
2930
}
3031

32+
/**
33+
* Gets the collector port from the environment variables.
34+
* If the port is not set, it will fall back to the default port.
35+
* If the port is invalid, it will log a warning and return the default port.
36+
* @returns {number}
37+
*/
38+
function getCollectorPort() {
39+
if (!("COLLECTOR_PORT" in process.env)) return DEFAULT_COLLECTOR_PORT;
40+
const port = Number(process.env.COLLECTOR_PORT);
41+
if (Number.isInteger(port) && port > 0 && port <= 65535) return port;
42+
43+
console.warn(
44+
`Invalid COLLECTOR_PORT "${process.env.COLLECTOR_PORT}". Falling back to ${DEFAULT_COLLECTOR_PORT}.`
45+
);
46+
return DEFAULT_COLLECTOR_PORT;
47+
}
48+
3149
module.exports = {
3250
reqBody,
3351
queryParams,
3452
validBaseUrl,
53+
getCollectorPort,
3554
};

docker/.env.example

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -414,6 +414,9 @@ GID='1000'
414414
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
415415
# COLLECTOR_ALLOW_ANY_IP="true"
416416

417+
# Port the collector listens on.
418+
# COLLECTOR_PORT=8888
419+
417420
# Specify the target languages for when using OCR to parse images and PDFs.
418421
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
419422
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.

server/.env.example

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -418,6 +418,9 @@ TTS_PROVIDER="native"
418418
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
419419
# COLLECTOR_ALLOW_ANY_IP="true"
420420

421+
# Port the collector listens on. Must match the collector process when running services separately.
422+
# COLLECTOR_PORT=8888
423+
421424
# Specify the target languages for when using OCR to parse images and PDFs.
422425
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
423426
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.

server/utils/collectorApi/index.js

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,9 @@ const { Agent } = require("undici");
1515
// so no additional security is needed on the endpoint directly. Auth is done however by the express
1616
// middleware prior to leaving the node-side of the application so that is good enough >:)
1717
class CollectorApi {
18+
/** @type {number} - The default collector port */
19+
static DEFAULT_COLLECTOR_PORT = 8888;
20+
1821
/** @type {number} - The maximum timeout for extension requests in milliseconds */
1922
extensionRequestTimeout = 15 * 60_000; // 15 minutes
2023
/** @type {Agent} - The agent for extension requests */
@@ -23,10 +26,29 @@ class CollectorApi {
2326
bodyTimeout: this.extensionRequestTimeout,
2427
});
2528

29+
/**
30+
* Gets the collector port from the environment variables.
31+
* If the port is not set, it will fall back to the default port.
32+
* If the port is invalid, it will log a warning and return the default port.
33+
* @returns {number}
34+
*/
35+
static getCollectorPort() {
36+
if (!("COLLECTOR_PORT" in process.env)) return this.DEFAULT_COLLECTOR_PORT;
37+
const port = Number(
38+
process.env.COLLECTOR_PORT || this.DEFAULT_COLLECTOR_PORT
39+
);
40+
if (Number.isInteger(port) && port > 0 && port <= 65535) return port;
41+
42+
console.warn(
43+
`Invalid COLLECTOR_PORT "${process.env.COLLECTOR_PORT}". Falling back to ${this.DEFAULT_COLLECTOR_PORT}.`
44+
);
45+
return this.DEFAULT_COLLECTOR_PORT;
46+
}
47+
2648
constructor() {
2749
const { CommunicationKey } = require("../comKey");
2850
this.comkey = new CommunicationKey();
29-
this.endpoint = `http://0.0.0.0:${process.env.COLLECTOR_PORT || 8888}`;
51+
this.endpoint = `http://0.0.0.0:${CollectorApi.getCollectorPort()}`;
3052
}
3153

3254
log(text, ...args) {

server/utils/comKey/index.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ const keyPath =
1414
// that can then be appended as a header value to do integrity checking on a payload. Given the
1515
// nature of this class and that keys are rolled constantly, this protects the request
1616
// integrity of requests sent to the collector as only the server can sign these requests.
17-
// This keeps accidental misconfigurations of AnythingLLM that leaving port 8888 open from
17+
// This keeps accidental misconfigurations of AnythingLLM that leave the collector port open from
1818
// being abused or SSRF'd by users scraping malicious sites who have a loopback embedded in a <script>, for example.
1919
// Since each request to the collector must be signed to be valid, unsigned requests directly to the collector
2020
// will be dropped and must go through the /server endpoint directly.

server/utils/helpers/updateENV.js

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1288,6 +1288,7 @@ function dumpENV() {
12881288

12891289
"STORAGE_DIR",
12901290
"SERVER_PORT",
1291+
"COLLECTOR_PORT",
12911292
// For persistent data encryption
12921293
"SIG_KEY",
12931294
"SIG_SALT",

0 commit comments

Comments
 (0)