diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index af3ac1bd..26cb3eb1 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -797,13 +797,18 @@ def _sanitize_concept_name(name: str) -> str: _parse_yaml_list_value = frontmatter.parse_list_value -def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool, brief: str = "") -> None: - """Write or update a concept page, managing the sources frontmatter.""" - concepts_dir = wiki_dir / "concepts" - concepts_dir.mkdir(parents=True, exist_ok=True) +def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool, brief: str = "", topic_dir: Path | None = None) -> None: + """Write or update a concept page, managing the sources frontmatter. + + When ``topic_dir`` is given (topic-tree mode) the page is written there + instead of the flat ``concepts/`` directory; the basename is unchanged so + name-based wikilinks still resolve. + """ + base_dir = topic_dir if topic_dir is not None else (wiki_dir / "concepts") + base_dir.mkdir(parents=True, exist_ok=True) safe_name = _sanitize_concept_name(name) - path = (concepts_dir / f"{safe_name}.md").resolve() - if not path.is_relative_to(concepts_dir.resolve()): + path = (base_dir / f"{safe_name}.md").resolve() + if not path.is_relative_to(base_dir.resolve()): logger.warning("Concept name escapes concepts dir: %s", name) return diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 5a755d76..9fd913af 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -9,6 +9,7 @@ from openkb.config import get_extra_headers, get_timeout_extra_args from openkb.agent.tools import ( get_wiki_page_content, + read_topic_node, read_wiki_file, read_wiki_image, write_kb_file, @@ -48,10 +49,45 @@ """ +_QUERY_INSTRUCTIONS_TREE = """\ +You are OpenKB, a knowledge-base Q&A agent. You answer questions by searching the wiki. 
+ +{schema_md} + +## Search strategy (topic tree) +The concepts/ wiki is a TOPIC TREE — descend it, do not enumerate everything. +1. Call read_topic("") to see the root summary, its child topics, and any concepts there. +2. Pick the child topic(s) most relevant to the question; call read_topic("") + to descend (paths nest, e.g. "attention/multi-head"). +3. Repeat until you reach the relevant concept leaves (listed under "concepts here"). +4. read_file the relevant concept pages. For "who/what is X" about a named person, + organization, place, or product, read the matching entities/ page. +5. For detailed source content, follow a summary page's `full_text` frontmatter: + short docs → read_file that path; pageindex docs → get_page_content(doc_name, pages) + with tight page ranges. Never fetch a whole document. +6. Source content may reference images; use get_image when needed. +7. If a branch has nothing useful, back up and try a sibling. Synthesize a clear, + concise, well-cited answer grounded in wiki content. + +Answer based only on wiki content. Be concise. +Before each tool call, output one short sentence explaining the reason. + +If you cannot find relevant information, say so clearly. +""" + + def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent: """Build and return the Q&A agent.""" schema_md = get_agents_md(Path(wiki_root)) - instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) + from openkb.config import load_config + + tree_on = bool( + load_config(Path(wiki_root).parent / ".openkb" / "config.yaml").get( + "topic_tree", False + ) + ) + template = _QUERY_INSTRUCTIONS_TREE if tree_on else _QUERY_INSTRUCTIONS_TEMPLATE + instructions = template.format(schema_md=schema_md) instructions += f"\n\nIMPORTANT: Answer in {language} language." 
@function_tool @@ -88,12 +124,27 @@ def get_image(image_path: str) -> ToolOutputImage | ToolOutputText: return ToolOutputImage(image_url=result["image_url"]) return ToolOutputText(text=result["text"]) + @function_tool + def read_topic(rel: str = "") -> str: + """Navigate the concept topic tree top-down. + + Start at "" (root); the result lists child topics and the concepts at + this node. Descend by calling again with a child topic's path (e.g. + "attention" or "attention/multi-head"); read concept leaves with + read_file. Do not enumerate the whole tree. + """ + return read_topic_node(rel, wiki_root) + from agents.model_settings import ModelSettings + tools = [read_file, get_page_content, get_image] + if tree_on: + tools.append(read_topic) + return Agent( name="wiki-query", instructions=instructions, - tools=[read_file, get_page_content, get_image], + tools=tools, model=f"litellm/{model}", model_settings=ModelSettings( parallel_tool_calls=False, diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index f954623f..23248185 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -54,6 +54,32 @@ def read_wiki_file(path: str, wiki_root: str) -> str: return full_path.read_text(encoding="utf-8") +def read_topic_node(rel: str, wiki_root: str) -> str: + """Render a topic node: its summary, child topics, and concept briefs. + + Use to navigate the concept topic tree top-down: start at ``""`` (root), + pick a child topic, call again with its path, until you reach the concept + leaves you need (then read them with read_wiki_file). + + Args: + rel: Topic path relative to ``concepts/`` (``""`` for root, + ``"attention"``, ``"attention/multi-head"``). + wiki_root: Absolute path to the wiki root directory. 
+ """ + from openkb.topic_tree import read_topic + + concepts_root = Path(wiki_root) / "concepts" + view = read_topic(concepts_root, rel) + lines = [f"# topic: {rel or '(root)'}", "", view.summary, ""] + if view.child_topics: + lines.append("## child topics") + lines += [f"- {n}: {s}" for n, s in view.child_topics] + if view.child_concepts: + lines.append("## concepts here") + lines += [f"- [[{stem}]]: {brief}" for stem, brief in view.child_concepts] + return "\n".join(lines) + + def parse_pages(pages: str) -> list[int]: """Parse a page specification string into a sorted, deduplicated list of page numbers. diff --git a/openkb/cli.py b/openkb/cli.py index 28694987..2894eae5 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -52,6 +52,7 @@ def filter(self, record: logging.LogRecord) -> bool: from openkb.locks import atomic_write_json, atomic_write_text, kb_ingest_lock, kb_read_lock from openkb.log import append_log from openkb.schema import AGENTS_MD, INDEX_SEED, PAGE_CONTENT_DIRS +from openkb.topic_tree import bootstrap as tt_bootstrap # Suppress warnings after all imports — markitdown overrides filters at import time import warnings @@ -1691,6 +1692,38 @@ def lint(ctx, fix): asyncio.run(run_lint(kb_dir)) +@cli.command() +@click.pass_context +def reindex(ctx): + """Build the concept topic tree from the existing flat wiki/concepts/ (experimental). + + No-op unless `topic_tree: true` is set in .openkb/config.yaml. + """ + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) + if kb_dir is None: + click.echo("No knowledge base found. Run `openkb init` first.") + return + config = load_config(kb_dir / ".openkb" / "config.yaml") + if not bool(config.get("topic_tree", False)): + click.echo( + "topic_tree is not enabled. Set `topic_tree: true` in " + ".openkb/config.yaml first." 
+ ) + return + _setup_llm_key(kb_dir) + model = config.get("model", DEFAULT_CONFIG["model"]) + from openkb.topic_tree_llm import make_cluster, make_summarize + + concepts_root = kb_dir / "wiki" / "concepts" + with kb_ingest_lock(kb_dir / ".openkb"): + n = tt_bootstrap( + concepts_root, + cluster=make_cluster(model), + summarize=make_summarize(model), + ) + click.echo(f"Reindexed {n} concept(s) into the topic tree.") + + @cli.command() @click.option("--open/--no-open", "open_browser", default=True, help="Open the graph in your browser after generating (default: on; --no-open for headless).") diff --git a/openkb/lint.py b/openkb/lint.py index 8b7674b7..9ff73b82 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -166,12 +166,23 @@ def list_existing_wiki_targets(wiki_dir: Path) -> set[str]: Used to seed the whitelist passed to :func:`strip_ghost_wikilinks` from both the compile pipeline and any other code path that writes LLM-generated content to the wiki (e.g. ``openkb query --save``). + + Concepts may be nested under a topic tree, so they are indexed + recursively by BOTH their relative path (``concepts/<...>/<stem>``) and + their bare ``<stem>`` (Obsidian-style, path-independent) — the bare stem + is what lets a link survive a topic split that moves the file.
""" targets: set[str] = set() concepts_dir = wiki_dir / "concepts" summaries_dir = wiki_dir / "summaries" if concepts_dir.is_dir(): - targets.update(f"concepts/{p.stem}" for p in concepts_dir.glob("*.md")) + for p in concepts_dir.rglob("*.md"): + if p.name == "_topic.md": + continue + rel = p.relative_to(wiki_dir).with_suffix("") + targets.add(str(rel).replace("\\", "/")) # concepts/<...>/<stem> + targets.add(f"concepts/{p.stem}") # path-independent concepts/<stem> + targets.add(p.stem) # bare <stem> if summaries_dir.is_dir(): targets.update(f"summaries/{p.stem}" for p in summaries_dir.glob("*.md")) entities_dir = wiki_dir / "entities" diff --git a/openkb/topic_tree.py b/openkb/topic_tree.py new file mode 100644 index 00000000..45d14754 --- /dev/null +++ b/openkb/topic_tree.py @@ -0,0 +1,232 @@ +"""Generic hierarchical-index engine over a page collection. + +A topic node is a directory containing a ``_topic.md`` (summary + size). +Children are derived from the directory: subdirectories are child topics, +``*.md`` files (except ``_topic.md``) are concept leaves. The POC wires +this to ``wiki/concepts/`` only; entities/documents can reuse it later by +passing different callables.
+""" +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Optional + +import yaml + +from openkb.locks import atomic_write_text + +FANOUT_K = 10 +MAX_DEPTH = 6 +TOPIC_FILE = "_topic.md" + + +@dataclass +class TopicNodeView: + summary: str + child_topics: list[tuple[str, str]] = field(default_factory=list) # (name, summary) + child_concepts: list[tuple[str, str]] = field(default_factory=list) # (stem, brief) + + +def _frontmatter(md: Path) -> dict: + if not md.is_file(): + return {} + text = md.read_text(encoding="utf-8") + m = re.match(r"^---\n(.*?)\n---\n", text, re.DOTALL) + if not m: + return {} + try: + data = yaml.safe_load(m.group(1)) or {} + except yaml.YAMLError: + return {} + return data if isinstance(data, dict) else {} + + +def _brief(concept_md: Path) -> str: + return str(_frontmatter(concept_md).get("description", "")).strip() + + +def write_topic_md(node_dir: Path, summary: str, size: int) -> None: + node_dir.mkdir(parents=True, exist_ok=True) + # Dump the frontmatter as a mapping (not a bare scalar) so PyYAML never + # emits a ``...`` document-end marker that would corrupt the block, and + # multi-line summaries are properly escaped/round-tripped. 
+ fm = yaml.safe_dump( + {"type": "topic", "summary": summary, "size": int(size)}, + sort_keys=False, + allow_unicode=True, + ).strip() + body = f"---\n{fm}\n---\n\n# {node_dir.name or 'root'}\n\n{summary}\n" + atomic_write_text(node_dir / TOPIC_FILE, body) + + +def child_count(node_dir: Path) -> int: + subtopics = [d for d in node_dir.iterdir() if d.is_dir()] + concepts = [f for f in node_dir.glob("*.md") if f.name != TOPIC_FILE] + return len(subtopics) + len(concepts) + + +def read_topic(concepts_root: Path, rel: str = "") -> TopicNodeView: + node_dir = concepts_root if not rel else concepts_root / rel + summary = str(_frontmatter(node_dir / TOPIC_FILE).get("summary", "")).strip() + child_topics: list[tuple[str, str]] = [] + child_concepts: list[tuple[str, str]] = [] + if node_dir.is_dir(): + for child in sorted(node_dir.iterdir()): + if child.is_dir(): + sub_sum = str(_frontmatter(child / TOPIC_FILE).get("summary", "")).strip() + child_topics.append((child.name, sub_sum)) + elif child.suffix == ".md" and child.name != TOPIC_FILE: + child_concepts.append((child.stem, _brief(child))) + return TopicNodeView( + summary=summary, child_topics=child_topics, child_concepts=child_concepts + ) + + +ChooseFn = Callable[[TopicNodeView, str], Optional[str]] + + +def place_concept( + concepts_root: Path, + stem: str, + brief: str, + content: str, + *, + choose: ChooseFn, + on_overflow: Optional[Callable[[Path], None]] = None, +) -> Path: + """Descend from the root, letting ``choose`` pick a child topic at each + level, until it returns None; drop the concept as a leaf there. + + Cost is O(depth) ``choose`` calls. ``on_overflow`` (if given) fires on + the landing node when its direct-child count exceeds ``FANOUT_K``. 
+ """ + rel = "" + for _ in range(MAX_DEPTH): + view = read_topic(concepts_root, rel) + pick = choose(view, brief) + if pick is None: + break + if pick not in {t for t, _ in view.child_topics}: + break # choose returned a non-existent child; stop here defensively + rel = f"{rel}/{pick}" if rel else pick + node_dir = concepts_root if not rel else concepts_root / rel + node_dir.mkdir(parents=True, exist_ok=True) + path = node_dir / f"{stem}.md" + atomic_write_text(path, content) + if on_overflow is not None and child_count(node_dir) > FANOUT_K: + on_overflow(node_dir) + return path + + +ClusterFn = Callable[[list[tuple[str, str]]], dict[str, list[str]]] +SummarizeFn = Callable[[str, list[str]], str] + + +def split_node(node_dir: Path, *, cluster: ClusterFn, summarize: SummarizeFn) -> None: + """Cluster a node's direct concept leaves into subtopics and move them in. + + Files are moved (``Path.replace``), not copied; because wikilinks resolve + by bare stem, links to moved concepts keep resolving. 
+ """ + view = read_topic(node_dir.parent if node_dir.name else node_dir, node_dir.name) + leaves = {stem: brief for stem, brief in view.child_concepts} + if not leaves: + return + groups = cluster(list(leaves.items())) + for sub_name, stems in groups.items(): + if not stems: + continue + sub_dir = node_dir / sub_name + sub_dir.mkdir(parents=True, exist_ok=True) + write_topic_md( + sub_dir, summarize(sub_name, [leaves.get(s, "") for s in stems]), len(stems) + ) + for stem in stems: + src = node_dir / f"{stem}.md" + if src.is_file(): + src.replace(sub_dir / f"{stem}.md") + # refresh the split node's own summary/size + new_view = read_topic(node_dir.parent if node_dir.name else node_dir, node_dir.name) + size = len(new_view.child_topics) + len(new_view.child_concepts) + write_topic_md(node_dir, view.summary or node_dir.name, size) + + +def place_topic_dir(concepts_root: Path, *, brief: str, choose: ChooseFn) -> Path: + """Descend with ``choose`` and return the landing topic directory WITHOUT + writing a concept file. Lets the caller own the concept-page format (e.g. + the compiler's ``_write_concept``) while the tree owns placement.""" + rel = "" + for _ in range(MAX_DEPTH): + view = read_topic(concepts_root, rel) + pick = choose(view, brief) + if pick is None or pick not in {t for t, _ in view.child_topics}: + break + rel = f"{rel}/{pick}" if rel else pick + node = concepts_root if not rel else concepts_root / rel + node.mkdir(parents=True, exist_ok=True) + return node + + +def _build_subtree( + node_dir: Path, + items: list[tuple[str, str, str]], # (stem, brief, content) + cluster: ClusterFn, + summarize: SummarizeFn, + depth: int, +) -> int: + """Recursively build a topic subtree under ``node_dir`` (already created, + with its ``_topic.md``). 
Clusters the full item set at this level, recurses + into any group still larger than ``FANOUT_K``, and writes leaves otherwise.""" + if len(items) <= FANOUT_K or depth >= MAX_DEPTH: + for stem, _brief_, content in items: + atomic_write_text(node_dir / f"{stem}.md", content) + return len(items) + + briefs = {stem: b for stem, b, _ in items} + contents = {stem: c for stem, _, c in items} + groups = cluster([(s, briefs[s]) for s in contents]) + + placed = 0 + seen: set[str] = set() + for name, stems in groups.items(): + kept = [s for s in stems if s in contents and s not in seen] + if not kept: + continue + seen.update(kept) + sub = node_dir / name + sub.mkdir(parents=True, exist_ok=True) + write_topic_md(sub, summarize(name, [briefs[s] for s in kept]), len(kept)) + placed += _build_subtree( + sub, [(s, briefs[s], contents[s]) for s in kept], cluster, summarize, depth + 1 + ) + # Any concept the clusterer dropped stays as a leaf at this node. + for s in [s for s in contents if s not in seen]: + atomic_write_text(node_dir / f"{s}.md", contents[s]) + placed += 1 + return placed + + +def bootstrap( + concepts_root: Path, + *, + cluster: ClusterFn, + summarize: SummarizeFn, +) -> int: + """Build a topic tree over the existing flat concepts under ``concepts_root``. + + Top-down, global cold-start seed: cluster the FULL concept set into top + topics, recurse into any topic still over ``FANOUT_K``. Building from the + whole set (rather than greedily one-by-one) avoids freezing the high-level + taxonomy on early-arriving concepts. Returns the number placed. + """ + concepts_root.mkdir(parents=True, exist_ok=True) + # Deterministic order; read all into memory, then clear the flat root. 
+ flat = sorted(p for p in concepts_root.glob("*.md") if p.name != TOPIC_FILE) + items = [(p.stem, _brief(p), p.read_text(encoding="utf-8")) for p in flat] + for p in flat: + p.unlink() + if not (concepts_root / TOPIC_FILE).exists(): + write_topic_md(concepts_root, "Knowledge base topics.", 0) + return _build_subtree(concepts_root, items, cluster, summarize, depth=0) diff --git a/openkb/topic_tree_llm.py b/openkb/topic_tree_llm.py new file mode 100644 index 00000000..2769cb6a --- /dev/null +++ b/openkb/topic_tree_llm.py @@ -0,0 +1,89 @@ +"""LLM-backed decision callables for the topic-tree engine. + +These are injected into the pure engine (``openkb.topic_tree``) so the engine +stays unit-testable without a network. Production code wires these in. +""" +from __future__ import annotations + +import json + +from openkb.agent.compiler import _JSON_RESPONSE_FORMAT, _llm_call +from openkb.topic_tree import FANOUT_K, TopicNodeView + +_CHOOSE = ( + "You are placing a new concept into a topic tree. Given the current node's " + "summary and its child topics, choose the ONE child topic the concept best " + "belongs under, or null to keep it at this node. " + 'Reply JSON: {{"pick": }}.\n\n' + "Node summary: {summary}\nChild topics:\n{topics}\n\nConcept: {brief}" +) + + +def make_choose(model: str): + def choose(view: TopicNodeView, brief: str): + topics = "\n".join(f"- {n}: {s}" for n, s in view.child_topics) or "(none)" + raw = _llm_call( + model, + [{"role": "user", "content": _CHOOSE.format( + summary=view.summary, topics=topics, brief=brief)}], + "topic-choose", + response_format=_JSON_RESPONSE_FORMAT, + ) + pick = (json.loads(raw) or {}).get("pick") + valid = {n for n, _ in view.child_topics} + return pick if pick in valid else None + + return choose + + +_CLUSTER = ( + "Cluster these concepts into 2-{kmax} coherent subtopics. Reply JSON: " + '{{"groups": {{"": ["", ...]}}}}. 
' + "Every stem must appear exactly once.\n\nConcepts:\n{items}" +) + + +def make_cluster(model: str): + def cluster(items): + listing = "\n".join(f"- {stem}: {brief}" for stem, brief in items) + raw = _llm_call( + model, + [{"role": "user", "content": _CLUSTER.format( + kmax=max(2, FANOUT_K // 2), items=listing)}], + "topic-cluster", + response_format=_JSON_RESPONSE_FORMAT, + ) + groups = (json.loads(raw) or {}).get("groups", {}) + known = {s for s, _ in items} + seen: set[str] = set() + clean: dict[str, list[str]] = {} + for name, stems in groups.items(): + kept = [s for s in stems if s in known and s not in seen] + seen.update(kept) + if kept: + clean[name] = kept + missing = [s for s in known if s not in seen] + if missing: + clean.setdefault("misc", []).extend(missing) + return clean + + return cluster + + +_SUMMARIZE = ( + 'Write a one-paragraph summary of the subtopic "{name}" that abstracts ' + "these concept briefs:\n{briefs}" +) + + +def make_summarize(model: str): + def summarize(name: str, briefs: list[str]) -> str: + raw = _llm_call( + model, + [{"role": "user", "content": _SUMMARIZE.format( + name=name, briefs="\n".join(f"- {b}" for b in briefs))}], + "topic-summary", + ) + return raw.strip() + + return summarize diff --git a/tests/test_compiler_topic_tree.py b/tests/test_compiler_topic_tree.py new file mode 100644 index 00000000..02724f24 --- /dev/null +++ b/tests/test_compiler_topic_tree.py @@ -0,0 +1,29 @@ +from openkb.agent.compiler import _write_concept +from openkb import topic_tree as tt + + +def test_write_concept_into_topic_dir(tmp_path): + wiki = tmp_path / "wiki" + _write_concept( + wiki, + "self-attention", + "# self-attention\n", + "summaries/doc.md", + is_update=False, + brief="q attends k", + topic_dir=wiki / "concepts" / "attention", + ) + assert (wiki / "concepts" / "attention" / "self-attention.md").is_file() + assert not (wiki / "concepts" / "self-attention.md").exists() # not flat + + +def test_place_topic_dir_descends(tmp_path): + 
root = tmp_path / "concepts" + tt.write_topic_md(root, "root", 0) + tt.write_topic_md(root / "attention", "att", 0) + node = tt.place_topic_dir( + root, brief="q", choose=lambda v, b: "attention" if v.child_topics else None + ) + assert node == root / "attention" + # descent only — no concept leaf written here, just the topic file + assert [p.name for p in node.glob("*.md")] == ["_topic.md"] diff --git a/tests/test_query_topic_tool.py b/tests/test_query_topic_tool.py new file mode 100644 index 00000000..254498e8 --- /dev/null +++ b/tests/test_query_topic_tool.py @@ -0,0 +1,16 @@ +from openkb.agent.tools import read_topic_node +from openkb import topic_tree as tt + + +def test_read_topic_node_renders(tmp_path): + wiki = tmp_path / "wiki" + root = wiki / "concepts" + tt.write_topic_md(root, "root summary", 1) + tt.write_topic_md(root / "attention", "attention summary", 1) + (root / "attention" / "self-attention.md").write_text( + '---\ntype: "Concept"\ndescription: "q attends k"\n---\n', encoding="utf-8" + ) + out = read_topic_node("attention", str(wiki)) + assert "attention summary" in out + assert "self-attention" in out + assert "q attends k" in out diff --git a/tests/test_reindex_cli.py b/tests/test_reindex_cli.py new file mode 100644 index 00000000..159305aa --- /dev/null +++ b/tests/test_reindex_cli.py @@ -0,0 +1,36 @@ +from unittest.mock import patch + +from click.testing import CliRunner + +from openkb.cli import cli + + +def _kb(tmp_path, topic_tree=True): + kb = tmp_path / "kb" + (kb / ".openkb").mkdir(parents=True) + (kb / "wiki" / "concepts").mkdir(parents=True) + (kb / "wiki" / "concepts" / "a.md").write_text("# a\n", encoding="utf-8") + flag = "topic_tree: true\n" if topic_tree else "" + (kb / ".openkb" / "config.yaml").write_text( + f"model: gpt-5.4\n{flag}", encoding="utf-8" + ) + return kb + + +def test_reindex_runs_when_enabled(tmp_path): + kb = _kb(tmp_path, topic_tree=True) + with patch("openkb.cli.tt_bootstrap", return_value=1) as boot, \ + 
patch("openkb.cli._setup_llm_key"): + res = CliRunner().invoke(cli, ["--kb-dir", str(kb), "reindex"]) + assert res.exit_code == 0, res.output + assert boot.called + assert "1" in res.output + + +def test_reindex_noop_when_disabled(tmp_path): + kb = _kb(tmp_path, topic_tree=False) + with patch("openkb.cli.tt_bootstrap") as boot, patch("openkb.cli._setup_llm_key"): + res = CliRunner().invoke(cli, ["--kb-dir", str(kb), "reindex"]) + assert res.exit_code == 0, res.output + assert not boot.called + assert "topic_tree" in res.output diff --git a/tests/test_topic_tree.py b/tests/test_topic_tree.py new file mode 100644 index 00000000..b8b92abb --- /dev/null +++ b/tests/test_topic_tree.py @@ -0,0 +1,139 @@ +from pathlib import Path +from unittest.mock import patch + +from openkb import topic_tree as tt + + +def _concept(d: Path, stem: str, brief: str): + d.mkdir(parents=True, exist_ok=True) + (d / f"{stem}.md").write_text( + f'---\ntype: "Concept"\ndescription: "{brief}"\n---\n# {stem}\n', + encoding="utf-8", + ) + + +def test_write_and_read_topic(tmp_path): + root = tmp_path / "concepts" + sub = root / "attention" + tt.write_topic_md(sub, summary="All about attention.", size=2) + _concept(sub, "self-attention", "queries attend to keys") + _concept(sub, "multi-head", "parallel attention heads") + view = tt.read_topic(root, "attention") + assert view.summary == "All about attention." 
+ stems = {s for s, _ in view.child_concepts} + assert stems == {"self-attention", "multi-head"} + assert view.child_topics == [] + assert tt.child_count(sub) == 2 + + +def test_place_descends_then_drops(tmp_path): + root = tmp_path / "concepts" + tt.write_topic_md(root, "root", 0) + tt.write_topic_md(root / "attention", "attention topic", 0) + calls = [] + + def choose(view, brief): + calls.append([t for t, _ in view.child_topics]) + return "attention" if any(t == "attention" for t, _ in view.child_topics) else None + + path = tt.place_concept( + root, "self-attention", "q attends k", "# self-attention\n", choose=choose + ) + assert path == (root / "attention" / "self-attention.md") + assert path.read_text(encoding="utf-8") == "# self-attention\n" + assert calls == [["attention"], []] # descended once, then stopped at leaf node + + +def test_place_triggers_overflow(tmp_path): + root = tmp_path / "concepts" + tt.write_topic_md(root, "root", 0) + for i in range(tt.FANOUT_K): + (root / f"c{i}.md").write_text("x", encoding="utf-8") + fired = [] + tt.place_concept( + root, "c-extra", "b", "x", choose=lambda v, b: None, + on_overflow=lambda d: fired.append(d), + ) + assert fired == [root] # K existing + 1 new > FANOUT_K + + +def test_split_clusters_and_moves(tmp_path): + root = tmp_path / "concepts" + tt.write_topic_md(root, "root", 0) + for s in ("self-attention", "multi-head", "adam", "warmup"): + (root / f"{s}.md").write_text(f"# {s}\n", encoding="utf-8") + + def cluster(items): + return { + "attention": ["self-attention", "multi-head"], + "training": ["adam", "warmup"], + } + + tt.split_node(root, cluster=cluster, summarize=lambda n, b: f"summary of {n}") + assert (root / "attention" / "self-attention.md").is_file() + assert (root / "training" / "adam.md").is_file() + assert not (root / "self-attention.md").exists() # moved, not copied + assert tt.read_topic(root, "attention").summary == "summary of attention" + + +def _half_cluster(items): + """Deterministic fake: 
split the items into two halves (a/b).""" + half = len(items) // 2 + return { + "a": [s for s, _ in items[:half]], + "b": [s for s, _ in items[half:]], + } + + +def test_bootstrap_topdown_global(tmp_path): + """Bootstrap clusters the FULL set top-down (global view), not one-by-one.""" + root = tmp_path / "concepts" + root.mkdir(parents=True) + stems = [f"a{i}" for i in range(6)] + [f"b{i}" for i in range(6)] # 12 > FANOUT_K + for s in stems: + (root / f"{s}.md").write_text(f"# {s}\n", encoding="utf-8") + + def cluster(items): # clean global split by prefix + return { + "group-a": [s for s, _ in items if s.startswith("a")], + "group-b": [s for s, _ in items if s.startswith("b")], + } + + n = tt.bootstrap(root, cluster=cluster, summarize=lambda name, briefs: f"s {name}") + assert n == 12 + assert (root / "_topic.md").exists() + assert (root / "group-a" / "a0.md").is_file() + assert (root / "group-b" / "b0.md").is_file() + assert not (root / "a0.md").exists() # nothing left flat at root + assert tt.read_topic(root, "group-a").summary == "s group-a" + + +def test_bootstrap_recurses_until_under_fanout(tmp_path): + """A group still larger than FANOUT_K recurses into sub-topics (depth grows).""" + root = tmp_path / "concepts" + root.mkdir(parents=True) + n_concepts = tt.FANOUT_K * 2 + 5 # forces >1 level of recursion + for i in range(n_concepts): + (root / f"c{i:02d}.md").write_text(f"# c{i}\n", encoding="utf-8") + n = tt.bootstrap(root, cluster=_half_cluster, summarize=lambda name, briefs: "s") + assert n == n_concepts + # at least one subtopic that itself has a subtopic (depth >= 2) + deep = [d for d in root.rglob("*") if d.is_dir() and any(c.is_dir() for c in d.iterdir())] + assert deep, "expected a multi-level tree (a subtopic containing subtopics)" + # every leaf node holds <= FANOUT_K concepts + for d in root.rglob("*"): + if d.is_dir(): + assert tt.child_count(d) <= tt.FANOUT_K + + +def test_make_choose_parses_pick(tmp_path): + from openkb import topic_tree_llm as 
ttl + + root = tmp_path / "concepts" + tt.write_topic_md(root, "root", 0) + tt.write_topic_md(root / "attention", "att", 0) + view = tt.read_topic(root, "") + with patch.object(ttl, "_llm_call", return_value='{"pick": "attention"}'): + assert ttl.make_choose("gpt-5.4")(view, "q attends k") == "attention" + with patch.object(ttl, "_llm_call", return_value='{"pick": null}'): + assert ttl.make_choose("gpt-5.4")(view, "x") is None diff --git a/tests/test_topic_tree_e2e.py b/tests/test_topic_tree_e2e.py new file mode 100644 index 00000000..beb34c1d --- /dev/null +++ b/tests/test_topic_tree_e2e.py @@ -0,0 +1,53 @@ +"""Full-stack topic-tree integration (deterministic, no network). + +Exercises the path a real ``openkb reindex`` takes — bootstrap a flat wiki of +concepts into a tree, then verify links survive the moves and the query tool +can navigate it — using injected deterministic callables instead of an LLM. +""" +from pathlib import Path + +from openkb import topic_tree as tt +from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks +from openkb.agent.tools import read_topic_node + + +def _concept(wiki: Path, stem: str, brief: str, links=()): + d = wiki / "concepts" + d.mkdir(parents=True, exist_ok=True) + body_links = " ".join(f"[[{l}]]" for l in links) + (d / f"{stem}.md").write_text( + f'---\ntype: "Concept"\ndescription: "{brief}"\n---\n# {stem}\n{body_links}\n', + encoding="utf-8", + ) + + +def test_reindex_builds_tree_links_survive_and_navigable(tmp_path): + wiki = tmp_path / "wiki" + stems = [f"concept-{i:02d}" for i in range(15)] + for s in stems: + _concept(wiki, s, f"brief for {s}", links=["concept-00"]) # all reference concept-00 + + n = tt.bootstrap( + wiki / "concepts", + cluster=lambda items: { + "group-a": [s for s, _ in items[: len(items) // 2]], + "group-b": [s for s, _ in items[len(items) // 2:]], + }, + summarize=lambda name, briefs: f"summary of {name}", + ) + assert n == 15 + + # 1. 
a multi-level tree was built (at least one subtopic directory) + assert any(d.is_dir() for d in (wiki / "concepts").iterdir()) + + # 2. concept-00 was moved into a subtopic, yet the [[concept-00]] links survive + assert not (wiki / "concepts" / "concept-00.md").exists() + targets = list_existing_wiki_targets(wiki) + out, ghosts = strip_ghost_wikilinks("see [[concept-00]]", targets) + assert ghosts == [] + assert "[[concept-00]]" in out + + # 3. the query tool can navigate from the root + root_render = read_topic_node("", str(wiki)) + assert "child topics" in root_render + assert "group-a" in root_render diff --git a/tests/test_topic_tree_links.py b/tests/test_topic_tree_links.py new file mode 100644 index 00000000..b2eb7650 --- /dev/null +++ b/tests/test_topic_tree_links.py @@ -0,0 +1,53 @@ +from pathlib import Path + +from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks +from openkb import topic_tree as tt + + +def _mk(p: Path, text: str = "x"): + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(text, encoding="utf-8") + + +def test_targets_include_bare_stem_for_nested_concept(tmp_path): + wiki = tmp_path / "wiki" + _mk(wiki / "concepts" / "attention-and-transformers" / "self-attention.md") + targets = list_existing_wiki_targets(wiki) + assert "self-attention" in targets # bare stem resolves + assert "concepts/attention-and-transformers/self-attention" in targets + + +def test_dir_prefixed_link_resolves_for_nested_concept(tmp_path): + """Compiler-generated ``[[concepts/<stem>]]`` links must still resolve after + a concept is nested under a topic dir (the form real concept bodies use).""" + wiki = tmp_path / "wiki" + _mk(wiki / "concepts" / "transformer" / "self-attention.md") + targets = list_existing_wiki_targets(wiki) + assert "concepts/self-attention" in targets + out, ghosts = strip_ghost_wikilinks("see [[concepts/self-attention]]", targets) + assert ghosts == [] + assert "[[concepts/self-attention]]" in out + + +def
test_bare_stem_link_not_stripped_when_nested(tmp_path): + wiki = tmp_path / "wiki" + _mk(wiki / "concepts" / "topic" / "self-attention.md") + targets = list_existing_wiki_targets(wiki) + out, ghosts = strip_ghost_wikilinks("see [[self-attention]]", targets) + assert ghosts == [] # link survives despite living in a subfolder + assert "[[self-attention]]" in out + + +def test_link_resolves_after_split_move(tmp_path): + wiki = tmp_path / "wiki" + root = wiki / "concepts" + tt.write_topic_md(root, "root", 0) + (root / "self-attention.md").write_text("# self-attention\n", encoding="utf-8") + tt.split_node( + root, + cluster=lambda items: {"attention": ["self-attention"]}, + summarize=lambda n, b: "s", + ) + targets = list_existing_wiki_targets(wiki) + out, ghosts = strip_ghost_wikilinks("see [[self-attention]]", targets) + assert ghosts == [] # bare-stem link still resolves after the move