Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion collector/.env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Placeholder .env file for collector runtime

# Port the collector listens on. The server must use the same COLLECTOR_PORT value when running services separately.
# COLLECTOR_PORT=8888

# This enables HTTP request/response logging in development. Set the value to a truthy string to enable; leave it empty or comment it out to disable.
# ENABLE_HTTP_LOGGER=""
# This enables timestamps for the HTTP logger. Set the value to true to enable; leave it empty or comment it out to disable.
# ENABLE_HTTP_LOGGER_TIMESTAMPS=""
# ENABLE_HTTP_LOGGER_TIMESTAMPS=""
7 changes: 4 additions & 3 deletions collector/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const bodyParser = require("body-parser");
const cors = require("cors");
const path = require("path");
const { ACCEPTED_MIMES } = require("./utils/constants");
const { reqBody } = require("./utils/http");
const { reqBody, getCollectorPort } = require("./utils/http");
const { processSingleFile } = require("./processSingleFile");
const { processLink, getLinkText } = require("./processLink");
const { wipeCollectorStorage } = require("./utils/files");
Expand All @@ -18,6 +18,7 @@ const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
const { httpLogger } = require("./middleware/httpLogger");
// Express application instance that the collector's routes are registered on.
const app = express();
// Size limit string shared by this module (consumers are outside this view).
const FILE_LIMIT = "3GB";
// Port resolved once at module load via getCollectorPort(); passed to app.listen() below.
const COLLECTOR_PORT = getCollectorPort();

// Only log HTTP requests in development mode and if the ENABLE_HTTP_LOGGER environment variable is set to true
if (
Expand Down Expand Up @@ -187,9 +188,9 @@ app.all("*", function (_, response) {
});

app
.listen(8888, async () => {
.listen(COLLECTOR_PORT, async () => {
await wipeCollectorStorage();
console.log(`Document processor app listening on port 8888`);
console.log(`Document processor app listening on port ${COLLECTOR_PORT}`);
})
.on("error", function (_) {
process.once("SIGUSR2", function () {
Expand Down
19 changes: 19 additions & 0 deletions collector/utils/http/index.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Load environment variables as a side effect of requiring this module:
// when NODE_ENV is "development" the file `.env.development` is used,
// otherwise dotenv is invoked with its default options.
process.env.NODE_ENV === "development"
? require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` })
: require("dotenv").config();
// Port returned by getCollectorPort() when COLLECTOR_PORT is unset or invalid.
const DEFAULT_COLLECTOR_PORT = 8888;

function reqBody(request) {
return typeof request.body === "string"
Expand Down Expand Up @@ -28,8 +29,26 @@ function validBaseUrl(baseUrl) {
}
}

/**
 * Resolves the port the collector should listen on from `COLLECTOR_PORT`.
 *
 * - Unset, empty, or whitespace-only values mean "not configured" and
 *   silently resolve to the default port (no warning is logged).
 * - Any other value must convert to an integer in the range 1-65535.
 *   If it does not, a warning is logged and the default port is returned.
 * @returns {number} A valid TCP port number.
 */
function getCollectorPort() {
  const rawPort = process.env.COLLECTOR_PORT;

  // A blank value (e.g. `COLLECTOR_PORT=""` left in a .env file) is treated
  // the same as an unset variable. Without this guard Number("") yields 0 and
  // the user would see a confusing `Invalid COLLECTOR_PORT ""` warning.
  if (rawPort === undefined || rawPort.trim() === "") {
    return DEFAULT_COLLECTOR_PORT;
  }

  const port = Number(rawPort);
  if (Number.isInteger(port) && port >= 1 && port <= 65535) return port;

  console.warn(
    `Invalid COLLECTOR_PORT "${rawPort}". Falling back to ${DEFAULT_COLLECTOR_PORT}.`
  );
  return DEFAULT_COLLECTOR_PORT;
}

module.exports = {
reqBody,
queryParams,
validBaseUrl,
getCollectorPort,
};
3 changes: 3 additions & 0 deletions docker/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,9 @@ GID='1000'
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
# COLLECTOR_ALLOW_ANY_IP="true"

# Port the collector listens on.
# COLLECTOR_PORT=8888

# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
Expand Down
3 changes: 3 additions & 0 deletions server/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,9 @@ TTS_PROVIDER="native"
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
# COLLECTOR_ALLOW_ANY_IP="true"

# Port the collector listens on. Must match the collector process when running services separately.
# COLLECTOR_PORT=8888

# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
Expand Down
24 changes: 23 additions & 1 deletion server/utils/collectorApi/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ const { Agent } = require("undici");
// so no additional security is needed on the endpoint directly. Auth is done however by the express
// middleware prior to leaving the node-side of the application so that is good enough >:)
class CollectorApi {
/** @type {number} - The default collector port */
static DEFAULT_COLLECTOR_PORT = 8888;

/** @type {number} - The maximum timeout for extension requests in milliseconds */
extensionRequestTimeout = 15 * 60_000; // 15 minutes
/** @type {Agent} - The agent for extension requests */
Expand All @@ -23,10 +26,29 @@ class CollectorApi {
bodyTimeout: this.extensionRequestTimeout,
});

/**
* Gets the collector port from the environment variables.
* If the port is not set, it will fall back to the default port.
* If the port is invalid, it will log a warning and return the default port.
* @returns {number}
*/
static getCollectorPort() {
if (!("COLLECTOR_PORT" in process.env)) return this.DEFAULT_COLLECTOR_PORT;
const port = Number(
process.env.COLLECTOR_PORT || this.DEFAULT_COLLECTOR_PORT
);
if (Number.isInteger(port) && port > 0 && port <= 65535) return port;

console.warn(
`Invalid COLLECTOR_PORT "${process.env.COLLECTOR_PORT}". Falling back to ${this.DEFAULT_COLLECTOR_PORT}.`
);
return this.DEFAULT_COLLECTOR_PORT;
}

constructor() {
const { CommunicationKey } = require("../comKey");
this.comkey = new CommunicationKey();
this.endpoint = `http://0.0.0.0:${process.env.COLLECTOR_PORT || 8888}`;
this.endpoint = `http://0.0.0.0:${CollectorApi.getCollectorPort()}`;
}

log(text, ...args) {
Expand Down
2 changes: 1 addition & 1 deletion server/utils/comKey/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const keyPath =
// that can then be appended as a header value to do integrity checking on a payload. Given the
// nature of this class and that keys are rolled constantly, this protects the request
// integrity of requests sent to the collector as only the server can sign these requests.
// This keeps accidental misconfigurations of AnythingLLM that leaving port 8888 open from
// This keeps accidental misconfigurations of AnythingLLM that leave the collector port open from
// being abused or SSRF'd by users scraping malicious sites who have a loopback embedded in a <script>, for example.
// Since each request to the collector must be signed to be valid, unsigned requests directly to the collector
// will be dropped and must go through the /server endpoint directly.
Expand Down
1 change: 1 addition & 0 deletions server/utils/helpers/updateENV.js
Original file line number Diff line number Diff line change
Expand Up @@ -1288,6 +1288,7 @@ function dumpENV() {

"STORAGE_DIR",
"SERVER_PORT",
"COLLECTOR_PORT",
// For persistent data encryption
"SIG_KEY",
"SIG_SALT",
Expand Down