diff --git a/collector/.env.example b/collector/.env.example index 63daf46cf00..259cb72c14b 100644 --- a/collector/.env.example +++ b/collector/.env.example @@ -1,6 +1,9 @@ # Placeholder .env file for collector runtime +# Port the collector listens on. The server must use the same COLLECTOR_PORT value when running services separately. +# COLLECTOR_PORT=8888 + # This enables HTTP request/response logging in development. Set value to truthy string to enable, leave empty value or comment out to disable # ENABLE_HTTP_LOGGER="" # This enables timestamps for the HTTP Logger. Set value to true to enable, leave empty or comment out to disable -# ENABLE_HTTP_LOGGER_TIMESTAMPS="" \ No newline at end of file +# ENABLE_HTTP_LOGGER_TIMESTAMPS="" diff --git a/collector/index.js b/collector/index.js index 69cb2ebac9a..cbcfbca49b4 100644 --- a/collector/index.js +++ b/collector/index.js @@ -8,7 +8,7 @@ const bodyParser = require("body-parser"); const cors = require("cors"); const path = require("path"); const { ACCEPTED_MIMES } = require("./utils/constants"); -const { reqBody } = require("./utils/http"); +const { reqBody, getCollectorPort } = require("./utils/http"); const { processSingleFile } = require("./processSingleFile"); const { processLink, getLinkText } = require("./processLink"); const { wipeCollectorStorage } = require("./utils/files"); @@ -18,6 +18,7 @@ const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity"); const { httpLogger } = require("./middleware/httpLogger"); const app = express(); const FILE_LIMIT = "3GB"; +const COLLECTOR_PORT = getCollectorPort(); // Only log HTTP requests in development mode and if the ENABLE_HTTP_LOGGER environment variable is set to true if ( @@ -187,9 +188,9 @@ app.all("*", function (_, response) { }); app - .listen(8888, async () => { + .listen(COLLECTOR_PORT, async () => { await wipeCollectorStorage(); - console.log(`Document processor app listening on port 8888`); + console.log(`Document processor app listening on 
// Port used when COLLECTOR_PORT is not configured or cannot be used.
const DEFAULT_COLLECTOR_PORT = 8888;

/**
 * Resolves the port the collector should listen on from `process.env.COLLECTOR_PORT`.
 *
 * - Unset or empty (`COLLECTOR_PORT=""`): returns the default without logging.
 *   An empty value is treated as "not configured" so the collector resolves the
 *   same port, with the same (lack of) noise, as the server-side resolver, which
 *   substitutes the default for an empty string.
 * - A value that `Number()` converts to an integer in the range 1-65535: returns it.
 * - Anything else: logs a warning and returns the default.
 *
 * @returns {number} The port to listen on.
 */
function getCollectorPort() {
  const rawPort = process.env.COLLECTOR_PORT;

  // Environment values are strings when present; undefined means the variable is absent.
  // Without the empty-string guard, Number("") === 0 would be reported as an invalid port.
  if (rawPort === undefined || rawPort === "") return DEFAULT_COLLECTOR_PORT;

  const port = Number(rawPort);
  if (Number.isInteger(port) && port > 0 && port <= 65535) return port;

  console.warn(
    `Invalid COLLECTOR_PORT "${rawPort}". Falling back to ${DEFAULT_COLLECTOR_PORT}.`
  );
  return DEFAULT_COLLECTOR_PORT;
}
diff --git a/server/.env.example b/server/.env.example index 70fba3cd5a9..4e4aaa57701 100644 --- a/server/.env.example +++ b/server/.env.example @@ -418,6 +418,9 @@ TTS_PROVIDER="native" # See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information. # COLLECTOR_ALLOW_ANY_IP="true" +# Port the collector listens on. Must match the collector process when running services separately. +# COLLECTOR_PORT=8888 + # Specify the target languages for when using OCR to parse images and PDFs. # This is a comma separated list of language codes as a string. Unsupported languages will be ignored. # Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index ef5b6b47a57..b3c931ff515 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -15,6 +15,9 @@ const { Agent } = require("undici"); // so no additional security is needed on the endpoint directly. Auth is done however by the express // middleware prior to leaving the node-side of the application so that is good enough >:) class CollectorApi { + /** @type {number} - The default collector port */ + static DEFAULT_COLLECTOR_PORT = 8888; + /** @type {number} - The maximum timeout for extension requests in milliseconds */ extensionRequestTimeout = 15 * 60_000; // 15 minutes /** @type {Agent} - The agent for extension requests */ @@ -23,10 +26,29 @@ class CollectorApi { bodyTimeout: this.extensionRequestTimeout, }); + /** + * Gets the collector port from the environment variables. + * If the port is not set, it will fall back to the default port. + * If the port is invalid, it will log a warning and return the default port. 
+ * @returns {number} + */ + static getCollectorPort() { + if (!("COLLECTOR_PORT" in process.env)) return this.DEFAULT_COLLECTOR_PORT; + const port = Number( + process.env.COLLECTOR_PORT || this.DEFAULT_COLLECTOR_PORT + ); + if (Number.isInteger(port) && port > 0 && port <= 65535) return port; + + console.warn( + `Invalid COLLECTOR_PORT "${process.env.COLLECTOR_PORT}". Falling back to ${this.DEFAULT_COLLECTOR_PORT}.` + ); + return this.DEFAULT_COLLECTOR_PORT; + } + constructor() { const { CommunicationKey } = require("../comKey"); this.comkey = new CommunicationKey(); - this.endpoint = `http://0.0.0.0:${process.env.COLLECTOR_PORT || 8888}`; + this.endpoint = `http://0.0.0.0:${CollectorApi.getCollectorPort()}`; } log(text, ...args) { diff --git a/server/utils/comKey/index.js b/server/utils/comKey/index.js index 5cc6b0c056f..3a51b1725bf 100644 --- a/server/utils/comKey/index.js +++ b/server/utils/comKey/index.js @@ -14,7 +14,7 @@ const keyPath = // that can then be appended as a header value to do integrity checking on a payload. Given the // nature of this class and that keys are rolled constantly, this protects the request // integrity of requests sent to the collector as only the server can sign these requests. -// This keeps accidental misconfigurations of AnythingLLM that leaving port 8888 open from +// This keeps accidental misconfigurations of AnythingLLM that leave the collector port open from // being abused or SSRF'd by users scraping malicious sites who have a loopback embedded in a