diff --git a/backend/danswer/connectors/github_files/connector.py b/backend/danswer/connectors/github_files/connector.py index 0d890923437..7d34e74e5dd 100644 --- a/backend/danswer/connectors/github_files/connector.py +++ b/backend/danswer/connectors/github_files/connector.py @@ -1,18 +1,20 @@ -"""GitHub Files connector — indexes files (default: JSON) sitting at a fixed -depth under a configurable path prefix. +"""GitHub Files connector — indexes files matching a given extension under +a configurable path prefix in a repository. -Matches the layout +Two modes: - <path_prefix>/<dir>/<file> +- Fixed-depth (default): matches `<path_prefix>/<dir>/<file>` + — i.e. exactly one folder under the prefix, file directly inside. Default + settings target a service-catalog layout: + service-catalog/products/<product>/<file>.json + Anything deeper or shallower is skipped. -i.e. exactly one folder under the prefix, file directly inside that folder. -Default settings target a service-catalog layout: +- Recursive (`recursive=True`): walks every folder under `path_prefix` + (the whole repo if the prefix is empty) and matches by extension at any + depth. Useful for "index all .md files in the repo" style configurations. - service-catalog/products/<product>/<file>.json - -Anything deeper or shallower is skipped, as are files at intermediate -directories. The connector reuses the existing GitHub access token credential -shape, so users don't need to re-enter their PAT. +The connector reuses the existing GitHub access token credential shape, so +users don't need to re-enter their PAT. 
""" import time from datetime import datetime @@ -76,6 +78,7 @@ def __init__( path_prefix: str = _DEFAULT_PATH_PREFIX, file_extension: str = _DEFAULT_FILE_EXTENSION, branch: str = "", + recursive: bool = False, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.repo_owner = repo_owner @@ -86,6 +89,7 @@ def __init__( file_extension if file_extension.startswith(".") else f".{file_extension}" ).lower() self.branch = branch or "" # empty -> use repo's default branch + self.recursive = recursive self.batch_size = batch_size self.github_client: Github | None = None @@ -112,7 +116,9 @@ def _resolve_branch(self, repo) -> str: def _list_matching_paths(self, repo, branch: str) -> list[tuple[str, str]]: """Walk the git tree once, returning (path, blob_sha) pairs for files - matching `//`.""" + matching the configured extension. In fixed-depth mode, only files at + `//` match; in recursive mode, + any file under `` (or the repo root) at any depth matches.""" branch_obj = _retry_on_rate_limit(self.github_client, repo.get_branch, branch) head_sha = branch_obj.commit.sha tree = _retry_on_rate_limit( @@ -129,9 +135,10 @@ def _list_matching_paths(self, repo, branch: str) -> list[tuple[str, str]]: path = element.path if prefix and not path.startswith(prefix + "/"): continue - parts = path.split("/") - if len(parts) != expected_depth: - continue + if not self.recursive: + parts = path.split("/") + if len(parts) != expected_depth: + continue if not path.lower().endswith(self.file_extension): continue results.append((path, element.sha)) @@ -177,13 +184,20 @@ def _convert_to_document( # while unchanged files stay deduped across runs. doc_id = f"{html_url}@{sha}" + # In recursive mode files at different depths can share a filename, so + # use the full repo-relative path as the semantic identifier. 
+ if self.recursive: + semantic_identifier = path + else: + semantic_identifier = ( + f"{product_dir}/{filename}" if product_dir else filename + ) + return Document( id=doc_id, sections=[Section(link=html_url, text=text)], source=DocumentSource.GITHUB_FILES, - semantic_identifier=f"{product_dir}/{filename}" - if product_dir - else filename, + semantic_identifier=semantic_identifier, doc_updated_at=doc_updated_at, metadata={ "repo": repo.full_name, @@ -211,7 +225,7 @@ def _fetch_documents( try: commits_iter = repo.get_commits( sha=branch, - path=self.path_prefix or None, + **({"path": self.path_prefix} if self.path_prefix else {}), **({"since": start} if start else {}), **({"until": end} if end else {}), ) @@ -283,6 +297,7 @@ def poll_source( path_prefix=os.environ.get("PATH_PREFIX", _DEFAULT_PATH_PREFIX), file_extension=os.environ.get("FILE_EXTENSION", _DEFAULT_FILE_EXTENSION), branch=os.environ.get("BRANCH", ""), + recursive=os.environ.get("RECURSIVE", "").lower() in ("1", "true", "yes"), ) connector.load_credentials( {"github_access_token": os.environ["GITHUB_ACCESS_TOKEN"]} diff --git a/web/src/app/admin/connectors/github-files/page.tsx b/web/src/app/admin/connectors/github-files/page.tsx index eeea6809413..7adfe8d8dce 100644 --- a/web/src/app/admin/connectors/github-files/page.tsx +++ b/web/src/app/admin/connectors/github-files/page.tsx @@ -3,7 +3,10 @@ import * as Yup from "yup"; import { useState } from "react"; import { EditIcon, GithubIcon, TrashIcon } from "@/components/icons/icons"; -import { TextFormField } from "@/components/admin/connectors/Field"; +import { + BooleanFormField, + TextFormField, +} from "@/components/admin/connectors/Field"; import { HealthCheckBanner } from "@/components/health/healthcheck"; import useSWR, { useSWRConfig } from "swr"; import { errorHandlingFetcher } from "@/lib/fetcher"; @@ -226,7 +229,11 @@ const Main = () => { const c = ccPairStatus.connector.connector_specific_config; const ext = c.file_extension || ".json"; const 
branch = c.branch ? `@${c.branch}` : ""; - return `${c.path_prefix}//*${ext}${branch}`; + const root = c.path_prefix || ""; + const pattern = c.recursive + ? `${root}/**/*${ext}` + : `${root}//*${ext}`; + return `${pattern}${branch}`; }, }, ]} @@ -247,6 +254,9 @@ const Main = () => { — i.e. exactly one folder under the prefix, file directly inside. Defaults target a{" "} service-catalog/products/<product>/*.json layout. + Enable Recursive to walk every folder under the prefix + (or the whole repo if the prefix is blank) and match by extension + at any depth — e.g. *.md across the entire repo. @@ -267,21 +277,29 @@ const Main = () => { label="Path Prefix:" subtext={ <> - The folder containing per-product subfolders. Files are - indexed at exactly one level deeper. + The folder to scan. In the default mode, files are + indexed exactly one level deeper (e.g.{" "} + <prefix>/<dir>/*<ext>). In + recursive mode this is the root of the walk; leave blank + to scan the whole repository. } /> + } validationSchema={Yup.object().shape({ @@ -291,13 +309,23 @@ const Main = () => { repo_name: Yup.string().required( "Please enter the name of the repository" ), - path_prefix: Yup.string().required( - "Please enter the path prefix to scan" - ), + path_prefix: Yup.string() + .defined() + .test( + "required-unless-recursive", + "Please enter the path prefix to scan", + function (value) { + return ( + Boolean(this.parent.recursive) || + (typeof value === "string" && value.length > 0) + ); + } + ), file_extension: Yup.string().required( "Please enter the file extension to filter on" ), branch: Yup.string(), + recursive: Yup.boolean(), })} initialValues={{ repo_owner: "", @@ -305,6 +333,7 @@ const Main = () => { path_prefix: "service-catalog/products", file_extension: ".json", branch: "", + recursive: false, }} refreshFreq={10 * 60} credentialId={githubCredential.id} diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index dd5adb9685c..ad76e1f2541 100644 --- a/web/src/lib/types.ts 
+++ b/web/src/lib/types.ts @@ -125,6 +125,7 @@ export interface GithubFilesConfig { path_prefix: string; file_extension: string; branch?: string; + recursive?: boolean; } export interface GitlabConfig {