Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 28 additions & 7 deletions cyteonto/cyteonto.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,13 @@ async def _embed_user_labels(
)
return cached[0]

missing = [lbl for lbl in labels if lbl not in existing]
unique_labels = list(dict.fromkeys(labels))
missing = [lbl for lbl in unique_labels if lbl not in existing]
if missing:
logger.info(
f"Generating {len(missing)} new descriptions for '{identifier}' "
f"(cached: {len(labels) - len(missing)})"
f"({len(labels)} labels -> {len(unique_labels)} unique, "
f"{len(unique_labels) - len(missing)} cached)"
)
new_descs, sub_usage = await describe_cells(
base_agent=self.agent,
Expand All @@ -263,9 +265,10 @@ async def _embed_user_labels(
existing[lbl] = desc
storage.save_descriptions(desc_path, existing)

# Embed every label. Blanks are not cached, so the raw label text is
# used as a fallback to keep the array aligned. Next compare(...) run
# will retry description generation for those labels and overwrite.
# Build the text to embed for every label position. Blanks are not
# cached, so the raw label text is used as a fallback to keep the
# array aligned with `labels`. The next compare(...) run will retry
# description generation for those labels and overwrite the fallback.
texts: list[str] = []
fallback_count = 0
for lbl in labels:
Expand All @@ -282,9 +285,27 @@ async def _embed_user_labels(
"label text as a fallback; they will be retried on the next run."
)

embeddings = await embed_texts(texts, self.embedding)
if embeddings is None:
# Only embed each unique text once, then fan results back out so the
# final array stays aligned with the original `labels` order/length.
text_to_idx: dict[str, int] = {}
unique_texts: list[str] = []
for t in texts:
if t not in text_to_idx:
text_to_idx[t] = len(unique_texts)
unique_texts.append(t)
if len(unique_texts) < len(texts):
logger.info(
f"Embedding {len(unique_texts)} unique texts for '{identifier}' "
f"({len(texts)} total label positions)"
)

unique_embeddings = await embed_texts(unique_texts, self.embedding)
if unique_embeddings is None:
raise RuntimeError(f"Failed to embed labels for '{identifier}'")
fan_out_idx = np.fromiter(
(text_to_idx[t] for t in texts), dtype=np.int64, count=len(texts)
)
embeddings = unique_embeddings[fan_out_idx]

storage.save_user_embeddings(
emb_path,
Expand Down
10 changes: 5 additions & 5 deletions modal_app/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# modal_app

Modal deployment of the `cyteonto_new` package as an HTTP service.
Modal deployment of the `cyteonto` package as an HTTP service.

## Quickstart

Expand Down Expand Up @@ -63,7 +63,7 @@ For override patterns (your own keys, different provider, custom metric paramete

## What is CyteOnto

`cyteonto_new` compares two sets of cell type annotations against the [Cell Ontology (CL)](https://obofoundry.org/ontology/cl.html). Given parallel label lists from a study author and one or more annotation algorithms, it:
`cyteonto` compares two sets of cell type annotations against the [Cell Ontology (CL)](https://obofoundry.org/ontology/cl.html). Given parallel label lists from a study author and one or more annotation algorithms, it:

1. Generates a structured description for every label with an LLM.
2. Embeds those descriptions with a configured embedding model.
Expand Down Expand Up @@ -127,7 +127,7 @@ A healthy deploy prints something like:
```
Created objects.
├── Created mount PythonPackage:modal_app
├── Created mount /home/<you>/.../cyteonto_new
├── Created mount /home/<you>/.../cyteonto
├── Created mount /home/<you>/.../pyproject.toml
├── Created function run_compare.
├── Created function setup.
Expand Down Expand Up @@ -205,7 +205,7 @@ Request body:
| `maxDescriptionConcurrency` | `int >= 1` | no | `100` | Concurrency cap for LLM description calls. |
| `usePubmedTool` | `bool` | no | `false` | If `true`, the LLM can call a PubMed abstract tool while generating each description. |
| `reasoning` | `bool` | no | `false` | Enable provider reasoning mode. Keep `false` for most use cases. |
| `metric` | `str` | no | `cosine_kernel` | See the `cyteonto_new` README for the full list. |
| `metric` | `str` | no | `cosine_kernel` | See the `cyteonto` README for the full list. |
| `metricParams` | `dict \| null` | no | `null` | Metric-specific parameters (for example `{"width": 0.25}` for `cosine_kernel`). |
| `minMatchSimilarity` | `float in [0, 1]` | no | `0.1` | Threshold below which a label is considered unmatched to any CL term. |
| `useCache` | `bool` | no | `true` | If `false`, all on-disk caches are bypassed for this run. |
Expand Down Expand Up @@ -489,4 +489,4 @@ Redeploy with `uv run modal deploy -m modal_app --env cytetrainer` after changin

## Related

- `cyteonto_new/README.md` covers the library, the similarity metrics, and the caching rules in depth.
- `cyteonto/README.md` covers the library, the similarity metrics, and the caching rules in depth.
10 changes: 5 additions & 5 deletions modal_app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

@app.function(
image=image,
volumes=VOLUME_MAP,
volumes=VOLUME_MAP, # type:ignore
secrets=[cyteonto_secrets],
timeout=app_config.MODAL_MAX_TIMEOUT_SECONDS,
cpu=app_config.WORKER_CPU,
Expand All @@ -45,15 +45,15 @@ async def run_compare(run_id: str, payload: dict[str, Any]) -> None:

@app.function(
image=image,
volumes=VOLUME_MAP,
volumes=VOLUME_MAP, # type:ignore
timeout=3600,
cpu=app_config.WORKER_CPU,
memory=app_config.WORKER_MEMORY_MB,
)
def setup(force: bool = False) -> int:
"""One-shot hook: download core ontology assets onto the Modal volume."""
from cyteonto_new.logger import logger
from cyteonto_new.setup import main as setup_main
from cyteonto.logger import logger
from cyteonto.setup import main as setup_main

argv = ["setup.py", "--data-dir", app_config.REMOTE_DATA_DIR]
if force:
Expand All @@ -79,7 +79,7 @@ def setup(force: bool = False) -> int:

@app.function(
image=image,
volumes=VOLUME_MAP,
volumes=VOLUME_MAP, # type:ignore
timeout=900,
cpu=app_config.API_CPU,
memory=app_config.API_MEMORY_MB,
Expand Down
5 changes: 3 additions & 2 deletions modal_app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class AppConfig:

PACKAGE_ROOT: Path = Path(__file__).resolve().parent.parent
PYPROJECT_PATH: Path = PACKAGE_ROOT / "pyproject.toml"
CYTEONTO_DIR: Path = PACKAGE_ROOT / "cyteonto_new"
CYTEONTO_DIR: Path = PACKAGE_ROOT / "cyteonto"

PYTHON_VERSION: str = "3.13"

Expand Down Expand Up @@ -81,8 +81,9 @@ def build_cyteonto_image() -> modal.Image:
.uv_pip_install(*deps)
.add_local_dir(
str(app_config.CYTEONTO_DIR),
"/root/cyteonto_new",
"/root/cyteonto",
copy=True,
ignore=["*.ipynb", "*.npz", "*.json"]
)
.add_local_file(
str(app_config.PYPROJECT_PATH),
Expand Down
6 changes: 3 additions & 3 deletions modal_app/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@ def build_agent(provider: str, model: str, api_key: str) -> Agent:

async def run_compare_job(run_id: str, payload: dict[str, Any], volume) -> None:
"""Execute one compare request end-to-end and update `status.json` at each stage."""
from cyteonto_new import CyteOnto
from cyteonto_new.logger import logger
from cyteonto_new.models import EmbdConfig
from cyteonto import CyteOnto
from cyteonto.logger import logger
from cyteonto.models import EmbdConfig

status = _read_status(run_id)
status.update({"state": "running", "startedAt": _utc_now()})
Expand Down
Loading