From 898e3ef0be47b6245912a003b26198334db6dfa7 Mon Sep 17 00:00:00 2001 From: strictshot Date: Fri, 12 Jun 2026 21:05:41 -0400 Subject: [PATCH 1/2] ticket #3 cs-assistant: Add :stats and verbose mode to the dev CLI - fixed some reformatting --- README.md | 7 ++++++ src/apps/dev_cli.py | 35 +++++++++++++++++++++++++++++ src/infrastructure/db/repository.py | 18 +++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/README.md b/README.md index 5e56947..e474d9e 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,13 @@ You should see an answer followed by a `Sources:` block listing the URLs used. ## Using it +While inside the interactive CLI (`ask>`), you can use the following commands to control the session and view metadata: +| Command | Description | +| :--- | :--- | +| `:stats` | Toggles the display of performance metrics (e.g., token count, response time) for subsequent prompts. | +| `:verbose` | Toggles verbose mode, displaying detailed internal logs, thought processes, or API interactions. | +| `exit` or `quit` | Safely terminates the interactive session and returns to your terminal shell. (You can also use `Ctrl-D`). | + A grounded answer looks like this: ``` diff --git a/src/apps/dev_cli.py b/src/apps/dev_cli.py index 3245537..324d186 100644 --- a/src/apps/dev_cli.py +++ b/src/apps/dev_cli.py @@ -4,6 +4,7 @@ from src.config.logger import get_logger from src.infrastructure.db import async_session_factory from src.infrastructure.db.repository import Repository +from src.retrieval.services import retrieval_service log = get_logger(__name__) @@ -19,13 +20,33 @@ async def _check_db() -> None: async def _repl() -> None: await _check_db() + verbose = False # flag for :verbose + print("cs-assistant dev CLI. Type 'exit' or Ctrl-D to quit.\n") + print("Type ':stats' or ':verbose' for cmds.\n") + while True: try: question = input("ask> ").strip() except (EOFError, KeyboardInterrupt): print("\nbye") return + + # :stats cmd + if question.lower() in {":stats"}: + async with async_session_factory() as session: + count_sources, count_chunks = await Repository.get_source_and_chunk_counts(session) + print(f"{count_sources} sources, {count_chunks} chunks loaded") + await _check_db() + continue + + # :verbose cmd + if question.lower() in {":verbose"}: + verbose = not verbose + print(f"Verbose mode: {'ON' if verbose else 'OFF'}") + continue + + # exit/quit cmd if question.lower() in {"exit", "quit"}: return if not question: @@ -37,6 +58,20 @@ async def _repl() -> None: print(f"\nError: {e}\n") continue + # printing out chunk content (verbose mode) + if verbose: + retrieved_chunks = await retrieval_service.get_relevant_chunks(question) + for chunk_item in retrieved_chunks: + source_url = chunk_item.chunk.source_url + similarity_score = chunk_item.score + snippet = " ".join( + (chunk_item.chunk.content.split())[:250] + ) # snippet ~250 words (maybe chars instead?) + print(f"URL: {source_url}") + print(f"Similarity score: {similarity_score}") + print(f"Content snippet: {snippet}") + print("-" * 60) + print(f"\n{answer.text}\n") if answer.sources: print("Sources:") diff --git a/src/infrastructure/db/repository.py b/src/infrastructure/db/repository.py index 0dc8571..38c4538 100644 --- a/src/infrastructure/db/repository.py +++ b/src/infrastructure/db/repository.py @@ -14,6 +14,24 @@ async def has_chunks(session: AsyncSession) -> bool: result = await session.execute(select(ChunkRow.id).limit(1)) return result.scalar_one_or_none() is not None + @staticmethod + async def get_source_and_chunk_counts(session: AsyncSession) -> tuple[int, int]: + count_sources = await Repository.count_sources(session) + count_chunks = await Repository.count_chunks(session) + return count_sources, count_chunks + + @staticmethod + async def count_chunks(session: AsyncSession) -> int: + result = await session.execute(select(func.count(ChunkRow.id))) + # if above doesn't work properly + # result = await session.execute(select(func.count().select_from(ChunkRow))) + return result.scalar_one() + + @staticmethod + async def count_sources(session: AsyncSession) -> int: + result = await session.execute(select(func.count(SourceRow.id))) + return result.scalar_one() + @staticmethod async def get_or_create_source( session: AsyncSession, *, name: str, url: str, source_type: str From f7630b23b3e2eac8bf94a1f4cc16aa1969d6a2fe Mon Sep 17 00:00:00 2001 From: strictshot Date: Fri, 19 Jun 2026 16:46:07 -0400 Subject: [PATCH 2/2] Heeded and implemented PR comments. --- README.md | 4 ++-- src/apps/dev_cli.py | 26 ++++++++++++-------- tests/infrastructure/db/test_repository.py | 28 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e474d9e..78562ae 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,8 @@ You should see an answer followed by a `Sources:` block listing the URLs used. While inside the interactive CLI (`ask>`), you can use the following commands to control the session and view metadata: | Command | Description | | :--- | :--- | -| `:stats` | Toggles the display of performance metrics (e.g., token count, response time) for subsequent prompts. | -| `:verbose` | Toggles verbose mode, displaying detailed internal logs, thought processes, or API interactions. | +| `:stats` | Quick check on how many sources and chunks are loaded in the DB. | +| `:verbose` | Toggles verbose mode. On every query, it'll dump the retrieved chunks (URLs, match scores, and snippets) right before the response. | | `exit` or `quit` | Safely terminates the interactive session and returns to your terminal shell. (You can also use `Ctrl-D`). | A grounded answer looks like this: diff --git a/src/apps/dev_cli.py b/src/apps/dev_cli.py index 324d186..68d1cb0 100644 --- a/src/apps/dev_cli.py +++ b/src/apps/dev_cli.py @@ -18,8 +18,19 @@ async def _check_db() -> None: ) +async def _print_db_status(): + async with async_session_factory() as session: + count_sources, count_chunks = await Repository.get_source_and_chunk_counts(session) + if count_chunks == 0: + print( + "WARNING: The database has no chunks. Run `make ingest` first, " + "or your questions will all be answered with 'I don't know'.\n" + ) + print(f"{count_sources} sources, {count_chunks} chunks loaded") + + async def _repl() -> None: - await _check_db() + await _print_db_status() verbose = False # flag for :verbose print("cs-assistant dev CLI. Type 'exit' or Ctrl-D to quit.\n") @@ -32,12 +43,9 @@ async def _repl() -> None: print("\nbye") return - # :stats cmd + # :stats cmd [THIS NEEDS FIXING/REFACTORING] if question.lower() in {":stats"}: - async with async_session_factory() as session: - count_sources, count_chunks = await Repository.get_source_and_chunk_counts(session) - print(f"{count_sources} sources, {count_chunks} chunks loaded") - await _check_db() + await _print_db_status() continue # :verbose cmd @@ -64,9 +72,7 @@ async def _repl() -> None: for chunk_item in retrieved_chunks: source_url = chunk_item.chunk.source_url similarity_score = chunk_item.score - snippet = " ".join( - (chunk_item.chunk.content.split())[:250] - ) # snippet ~250 words (maybe chars instead?) + snippet = chunk_item.chunk.content[:250] + "[...]" print(f"URL: {source_url}") print(f"Similarity score: {similarity_score}") print(f"Content snippet: {snippet}") @@ -76,7 +82,7 @@ async def _repl() -> None: if answer.sources: print("Sources:") for source in answer.sources: - print(f" - {source.url}") + print(f"{source.url}") print() diff --git a/tests/infrastructure/db/test_repository.py b/tests/infrastructure/db/test_repository.py index caff928..bec9519 100644 --- a/tests/infrastructure/db/test_repository.py +++ b/tests/infrastructure/db/test_repository.py @@ -119,3 +119,31 @@ async def test_upsert_idempotent_and_updates_on_conflict(session: AsyncSession): ) row = row_result.scalar_one() assert row.content == "version 2", "Upsert should update content on conflict" + + +async def test_get_source_and_chunk_counts(session: AsyncSession): + initial_sources, initial_chunks = await Repository.get_source_and_chunk_counts(session) + + source = await Repository.get_or_create_source( + session, name="Test Source", url="https://example.com/test", source_type="html" + ) + + current_sources, current_chunks = await Repository.get_source_and_chunk_counts(session) + assert current_sources == initial_sources + 1 + assert current_chunks == initial_chunks + embedding = [0.1] * settings.embedding_dim + + await Repository.upsert_chunk( + session, + content="Test content about Carleton CS", + embedding=embedding, + source_url="https://example.com/test", + source_type="html", + section_heading="Overview", + content_hash="hash_query_test_001", + source_id=source.id, + ) + + final_sources, final_chunks = await Repository.get_source_and_chunk_counts(session) + assert final_sources == initial_sources + 1 + assert final_chunks == initial_chunks + 1