From a538a30ccc0e2441966be7c0b0d245ffbca18f21 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Sun, 17 May 2026 18:57:38 -0700 Subject: [PATCH 01/10] FIX: RTD build - remove broken TOC entries, fix AML image paths, build PDF in CI The Read the Docs build for PyRIT has been failing silently because: 1. doc/myst.yml referenced two API pages that do not exist: - api/pyrit_setup_initializers.md - gen_api_md.py does not emit a separate page for pyrit.setup.initializers (its parent pyrit.setup has its own API members, so the script doesn't expand into submodules) - api/pyrit_ui.md - the pyrit/ui/ module doesn't exist Both entries produced RTD-fatal 'Table of contents entry does not exist' errors (added in #1469 and #1472 in March 2026). 2. The AML troubleshooting notebooks under doc/getting_started/troubleshooting/ referenced images via ./../../assets/aml_*.png which resolves to doc/assets/ (a directory that doesn't exist). The paired .py files correctly used ./../../../assets/ (3 ../, resolving to the repo-root assets/ where the images actually live). The .ipynb / .py pair was out of sync. The missing images caused xdvipdfmx to abort during the PDF export on RTD. 3. The build-book GitHub Actions workflow only ran 'jupyter-book build --all --html' (HTML-only), so PDF-only regressions silently slipped past CI while RTD failed. 
This change: - removes the two missing TOC entries from doc/myst.yml - syncs the .ipynb AML image paths to ./../../../assets/ matching the .py files - adds a 'docs-build-all' Makefile target that runs HTML + PDF together (mirroring RTD's 'jupyter-book build --all' behaviour) - updates .github/workflows/docs.yml to install texlive-xetex / latexmk and use 'make docs-build-all' so PDF regressions are caught in CI Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/docs.yml | 18 ++++++++++++++++-- Makefile | 13 ++++++++++++- .../troubleshooting/deploy_hf_model_aml.ipynb | 6 +++--- .../download_and_register_hf_model_aml.ipynb | 2 +- .../troubleshooting/score_aml_endpoint.ipynb | 8 ++++---- doc/myst.yml | 3 --- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 78ddda87f1..c8347a2943 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -55,10 +55,24 @@ jobs: - name: Install PyRIT with uv run: uv sync --extra all - # Build the book + # LaTeX toolchain needed for the PDF export (jupyter-book --pdf via xelatex). + # Mirrors the ReadTheDocs build so CI catches PDF-only issues (e.g. broken + # image paths) instead of letting them silently break the RTD build. + - name: Install LaTeX (for PDF export) + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + texlive-xetex \ + texlive-fonts-recommended \ + texlive-fonts-extra \ + texlive-plain-generic \ + texlive-latex-extra \ + latexmk + + # Build the book (HTML site + PDF export) - name: Build the book run: | - make docs-build + make docs-build-all # Upload the book's HTML as an artifact - name: Upload artifact uses: actions/upload-pages-artifact@v3 diff --git a/Makefile b/Makefile index f8b193bb76..d1de710acd 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ ty: # Build the full documentation site: # 1. 
Generate API reference JSON from Python source (griffe) # 2. Convert API JSON to MyST markdown pages -# 3. Build the Jupyter Book site +# 3. Build the Jupyter Book site (HTML only — fast, no LaTeX needed) # 4. Generate RSS feed docs-build: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json @@ -28,6 +28,17 @@ docs-build: cd doc && uv run jupyter-book build --all --html uv run ./build_scripts/generate_rss.py +# Build the full documentation site including the PDF export. +# Mirrors the ReadTheDocs build (.readthedocs.yaml) so CI catches PDF-only issues +# such as missing images that the HTML-only build silently ignores. +# Requires xelatex / latexmk on PATH (texlive-xetex + texlive-fonts-recommended + +# texlive-plain-generic + latexmk on Ubuntu). +docs-build-all: + uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json + uv run python build_scripts/gen_api_md.py + cd doc && uv run jupyter-book build --all --html --pdf + uv run ./build_scripts/generate_rss.py + # Regenerate only the API reference pages (without building the full site) docs-api: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json diff --git a/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb b/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb index 84c09152d7..254bea3bd1 100644 --- a/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb +++ b/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb @@ -47,9 +47,9 @@ "\n", "5. **AZURE_ML_MODEL_NAME_TO_DEPLOY**\n", " - If the model is listed in the AZURE ML Hugging Face model catalog, then supply the model name as shown in the following image.\n", - "
\"AML
\n", + "
\"AML
\n", " - If you intend to deploy the model from the AZURE ML workspace model registry, then use the model name as shown in the subsequent image.\n", - "
\"AML
\n", + "
\"AML
\n", "6. **AZURE_ML_MODEL_VERSION_TO_DEPLOY**\n", " - You can find the details of the model version in the images from previous step associated with the respective model.\n", "\n", @@ -277,7 +277,7 @@ "**Add deployment to an Azure ML endpoint created above**\n", "\n", "Please be aware that deploying, particularly larger models, may take some time. Once the deployment is finished, the provisioning state will be marked as 'Succeeded', as illustrated in the image below.\n", - "
\"AML
" + "
\"AML
" ] }, { diff --git a/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb b/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb index b56fb19719..700e9a9583 100644 --- a/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb +++ b/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb @@ -127,7 +127,7 @@ "\n", "9. **AZURE_ML_COMPUTE_NAME**\n", " - If you already have an Azure ML compute cluster, provide its name. If not, the script will create one based on the instance size and the specified minimum and maximum instances.\n", - "
\"AML
\n", + "
\"AML
\n", "\n", "10. **IDLE_TIME_BEFORE_SCALE_DOWN**\n", " - Set the duration for the Azure ML cluster to remain active before scaling down due to inactivity, ensuring efficient resource use. Typically, 3-4 hours is ideal for large size models.\n", diff --git a/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb b/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb index bd146d3caa..cc13c0ba1b 100644 --- a/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb +++ b/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb @@ -25,15 +25,15 @@ "\n", "1. **AZURE_ML_SCORE_DEPLOYMENT_NAME**\n", " - This deployment name can be acquired from the Azure ML managed online endpoint, as illustrated in image below.\n", - "
\"AML
\n", + "
\"AML
\n", "\n", "2. **AZURE_ML_MANAGED_ENDPOINT**\n", " - To obtain the managed endpoint, navigate through the Azure ML workspace by selecting 'Launch Studio', then 'Endpoints' on the left side, followed by 'Consume'. Copy the REST endpoint as depicted below.\n", - "
\"AML
\n", + "
\"AML
\n", "\n", "3. **AZURE_ML_KEY**\n", " - Navigate through the Azure ML workspace by selecting 'Launch Studio', then 'Endpoints' on the left side, followed by 'Consume'. The primary key can be obtained as shown in the subsequent image.\n", - "
\"AML
\n", + "
\"AML
\n", "\n" ] }, @@ -80,7 +80,7 @@ "**Azure ML endpoint JSON body**\n", "\n", "The JSON body can be acquired by the following method: Access the Hugging Face model within the Azure ML model catalog by going to the workspace, then to the studio, selecting 'Model Catalog', and using the search bar to find the model ID. Open the model to view the sample input schema as shown in the image below.\n", - "
\"aml_model_endpoint_schema.png\"
\n", + "
\"aml_model_endpoint_schema.png\"
\n", "\n", "In addition, we have compiled the details of the request and response for the Hugging Face models hosted on the Azure Machine Learning (Azure ML) endpoint. Please review the [provided link](./hf_aml_model_endpoint_guide.md) to access the JSON request body and response for the Azure ML endpoint. Additionally, you can deduce the schema from the response if a bad request was sent to the inference endpoint." ] diff --git a/doc/myst.yml b/doc/myst.yml index f703d6c8cd..98112fd5ed 100644 --- a/doc/myst.yml +++ b/doc/myst.yml @@ -200,10 +200,7 @@ project: - file: api/pyrit_scenario.md - file: api/pyrit_score.md - file: api/pyrit_setup.md - children: - - file: api/pyrit_setup_initializers.md - file: api/pyrit_show_versions.md - - file: api/pyrit_ui.md - file: blog/README.md children: - file: blog/2026_04_14_scoring_scorers.md From ffb96994b951a3526c2d311c049aaf963fba04fe Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 05:33:22 -0700 Subject: [PATCH 02/10] DOC: Replace HTML anchors with MyST (name)= targets in converter notebooks The 4 converter notebooks (1_text_to_text, 2_audio, 3_image, 4_video) used '' HTML anchors to mark headings as link targets. MyST parses these as anchor-only links and reports them as 'Link has no URL' errors during the docs build. Replace all 8 anchors across the 4 .ipynb files (and their paired .py source files) with MyST-native (name)= target syntax placed immediately before the heading: -> (non-llm-converters)= ## Non-LLM Converters -> ## Non-LLM Converters Existing cross-reference links of the form [Text](#slug) continue to resolve correctly against both the explicit target and MyST's auto-generated heading slug, so no link rewrites are needed. The jupytext '"main_language": "python"' metadata key was added to 2_audio_converters.ipynb and 4_video_converters.ipynb by 'jupytext --update --to ipynb'. This is harmless drift fix between the .py jupytext header and the .ipynb metadata block. 
This unblocks issue #1741 (enabling '--strict' on the docs build to catch silent RTD breakage). The '--strict' flip itself is deferred: adding it now would surface ~30 additional pre-existing errors unrelated to the converter notebooks (auth-required external URLs, broken refs to renamed/deleted files, mystmd v2 LaTeX-node limitations on PDF export, missing heading depths in auto-generated API md). Each category needs its own decision (fix vs. suppress via 'error_rules' in myst.yml), which is out of scope for the prerequisite anchor fix this PR delivers. See the PR description for the full breakdown. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/code/converters/1_text_to_text_converters.ipynb | 4 ++-- doc/code/converters/1_text_to_text_converters.py | 4 ++-- doc/code/converters/2_audio_converters.ipynb | 9 +++++---- doc/code/converters/2_audio_converters.py | 6 +++--- doc/code/converters/3_image_converters.ipynb | 4 ++-- doc/code/converters/3_image_converters.py | 4 ++-- doc/code/converters/4_video_converters.ipynb | 5 ++++- doc/code/converters/4_video_converters.py | 2 +- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/doc/code/converters/1_text_to_text_converters.ipynb b/doc/code/converters/1_text_to_text_converters.ipynb index dfb2de1856..1e996efc5d 100644 --- a/doc/code/converters/1_text_to_text_converters.ipynb +++ b/doc/code/converters/1_text_to_text_converters.ipynb @@ -22,7 +22,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(non-llm-converters)=\n", "## Non-LLM Converters\n", "\n", "Non-LLM converters use deterministic algorithms to transform text. These include:\n", @@ -454,7 +454,7 @@ "id": "10", "metadata": {}, "source": [ - "\n", + "(llm-based-converters)=\n", "## LLM-Based Converters\n", "\n", "LLM-based converters use language models to transform prompts. 
These converters are more flexible and can produce more natural variations, but they are slower and require an LLM target.\n", diff --git a/doc/code/converters/1_text_to_text_converters.py b/doc/code/converters/1_text_to_text_converters.py index 67cc57919d..6237720753 100644 --- a/doc/code/converters/1_text_to_text_converters.py +++ b/doc/code/converters/1_text_to_text_converters.py @@ -22,7 +22,7 @@ # - **[LLM-Based Converters](#llm-based-converters)**: AI-powered transformations including translation, variation, and semantic modifications # %% [markdown] -# +# (non-llm-converters)= # ## Non-LLM Converters # # Non-LLM converters use deterministic algorithms to transform text. These include: @@ -225,7 +225,7 @@ print("Variation Selector:", await var_selector.convert_async(prompt=prompt)) # type: ignore # %% [markdown] -# +# (llm-based-converters)= # ## LLM-Based Converters # # LLM-based converters use language models to transform prompts. These converters are more flexible and can produce more natural variations, but they are slower and require an LLM target. diff --git a/doc/code/converters/2_audio_converters.ipynb b/doc/code/converters/2_audio_converters.ipynb index f125cba736..c5b199fd7e 100644 --- a/doc/code/converters/2_audio_converters.ipynb +++ b/doc/code/converters/2_audio_converters.ipynb @@ -23,7 +23,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(text-to-audio)=\n", "## Text to Audio\n", "\n", "The `AzureSpeechTextToAudioConverter` converts text input into audio output, generating spoken audio files." @@ -72,7 +72,7 @@ "id": "3", "metadata": {}, "source": [ - "\n", + "(audio-to-text)=\n", "## Audio to Text\n", "\n", "The `AzureSpeechAudioToTextConverter` transcribes audio files into text. Below we use the audio file created in the previous section." @@ -117,7 +117,7 @@ "id": "5", "metadata": {}, "source": [ - "\n", + "(audio-to-audio)=\n", "## Audio to Audio\n", "\n", "Audio-to-audio converters modify existing audio files. 
All of these converters accept `audio_path` input\n", @@ -240,7 +240,8 @@ ], "metadata": { "jupytext": { - "cell_metadata_filter": "-all" + "cell_metadata_filter": "-all", + "main_language": "python" }, "language_info": { "codemirror_mode": { diff --git a/doc/code/converters/2_audio_converters.py b/doc/code/converters/2_audio_converters.py index 127b5c2b15..7c226f0767 100644 --- a/doc/code/converters/2_audio_converters.py +++ b/doc/code/converters/2_audio_converters.py @@ -23,7 +23,7 @@ # - **[Audio to Audio](#audio-to-audio)**: Modify audio files (speed, volume, echo, frequency, noise) # %% [markdown] -# +# (text-to-audio)= # ## Text to Audio # # The `AzureSpeechTextToAudioConverter` converts text input into audio output, generating spoken audio files. @@ -45,7 +45,7 @@ assert os.path.exists(audio_convert_result.output_text) # %% [markdown] -# +# (audio-to-text)= # ## Audio to Text # # The `AzureSpeechAudioToTextConverter` transcribes audio files into text. Below we use the audio file created in the previous section. @@ -70,7 +70,7 @@ print(transcript) # %% [markdown] -# +# (audio-to-audio)= # ## Audio to Audio # # Audio-to-audio converters modify existing audio files. 
All of these converters accept `audio_path` input diff --git a/doc/code/converters/3_image_converters.ipynb b/doc/code/converters/3_image_converters.ipynb index 2e3277d34a..cf71440a31 100644 --- a/doc/code/converters/3_image_converters.ipynb +++ b/doc/code/converters/3_image_converters.ipynb @@ -22,7 +22,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(text-to-image)=\n", "## Text to Image\n", "\n", "### QRCodeConverter\n", @@ -148,7 +148,7 @@ "id": "5", "metadata": {}, "source": [ - "\n", + "(image-to-image)=\n", "## Image to Image\n", "\n", "### AddTextImageConverter\n", diff --git a/doc/code/converters/3_image_converters.py b/doc/code/converters/3_image_converters.py index 3bf7ecac80..846a468447 100644 --- a/doc/code/converters/3_image_converters.py +++ b/doc/code/converters/3_image_converters.py @@ -21,7 +21,7 @@ # - **[Image to Image](#image-to-image)**: Modify or transform existing images # %% [markdown] -# +# (text-to-image)= # ## Text to Image # # ### QRCodeConverter @@ -76,7 +76,7 @@ display(image) # %% [markdown] -# +# (image-to-image)= # ## Image to Image # # ### AddTextImageConverter diff --git a/doc/code/converters/4_video_converters.ipynb b/doc/code/converters/4_video_converters.ipynb index 12f675f8dd..53e3ed6bac 100644 --- a/doc/code/converters/4_video_converters.ipynb +++ b/doc/code/converters/4_video_converters.ipynb @@ -21,7 +21,7 @@ "id": "1", "metadata": {}, "source": [ - "\n", + "(image-to-video)=\n", "## Image to Video\n", "\n", "### AddImageVideoConverter\n", @@ -74,6 +74,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/code/converters/4_video_converters.py b/doc/code/converters/4_video_converters.py index c537193c1a..0d0aec6bc6 100644 --- a/doc/code/converters/4_video_converters.py +++ b/doc/code/converters/4_video_converters.py @@ -20,7 +20,7 @@ # - **[Image to Video](#image-to-video)**: Add images to video files # %% [markdown] -# +# 
(image-to-video)= # ## Image to Video # # ### AddImageVideoConverter From f41e1e5d4e0da014dfd5c0bdc6b88cf942207592 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 05:45:21 -0700 Subject: [PATCH 03/10] DOC: fix internal broken refs to renamed, moved, and deleted targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These were real broken links in the docs that the current `make docs-build` was silently emitting as MyST '⛔' errors (build still exited 0). With '--strict' enabled (next commit), they would become hard build failures, so fix the underlying refs: - `doc/blog/2026_04_14_scoring_scorers.md` (2 occurrences): `../code/scoring/8_scorer_metrics.ipynb` → `../code/scoring/7_scorer_metrics.ipynb` (notebook renumbered). - `doc/blog/2025_02_11.md`: `../code/scenarios/9_baseline_only.ipynb` → linked file was deleted; repoint to `../code/scenarios/0_scenarios.ipynb#baseline-execution` which now hosts the Baseline Execution section. - `doc/getting_started/troubleshooting/{deploy_hf_model_aml,download_and_register_hf_model_aml,score_aml_endpoint}.ipynb`: `../setup/populating_secrets.md` → `../populating_secrets.md`. Drift fix: each paired `.py` already had the correct path; the `.ipynb` halves were stale. Synced via `jupytext --update --to ipynb`. - `doc/contributing/5_unit_tests.md`: `../../tests/unit/target/test_tts_target.py` → `../../tests/unit/prompt_target/target/test_tts_target.py` (the test directory layout changed). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/blog/2025_02_11.md | 2 +- doc/blog/2026_04_14_scoring_scorers.md | 4 ++-- doc/contributing/5_unit_tests.md | 2 +- .../troubleshooting/deploy_hf_model_aml.ipynb | 5 ++++- .../download_and_register_hf_model_aml.ipynb | 9 +++++---- .../troubleshooting/score_aml_endpoint.ipynb | 9 +++++++-- 6 files changed, 20 insertions(+), 11 deletions(-) diff --git a/doc/blog/2025_02_11.md b/doc/blog/2025_02_11.md index a8abfe1ae1..248f84413a 100644 --- a/doc/blog/2025_02_11.md +++ b/doc/blog/2025_02_11.md @@ -32,6 +32,6 @@ See the updated documentation [here](../code/datasets/1_loading_datasets.ipynb). ## What else can we do with this? -Now that we've loaded our dataset into PyRIT as a `SeedPromptDataset` the really exciting red teaming can begin. A great example of this is in our [Baseline-Only Execution](../code/scenarios/9_baseline_only.ipynb) notebook! We can use the prompts to evaluate the target by sending all the previously loaded prompts, modifying which attacks to use, and storing the scores for further analysis. +Now that we've loaded our dataset into PyRIT as a `SeedPromptDataset` the really exciting red teaming can begin. A great example of this is the [Baseline Execution](../code/scenarios/0_scenarios.ipynb#baseline-execution) section of our scenarios overview! We can use the prompts to evaluate the target by sending all the previously loaded prompts, modifying which attacks to use, and storing the scores for further analysis. In this blog post, we've walked through how we use structured datasets through our `SeedPrompt` and `SeedPromptDataset` classes. PyRIT's architecture allows for customization at every stage - whether through converters or configuring different scorers - and seed prompts set us up to effectively probe for risks in AI systems. Send over any contributions to add more datasets, refine seed prompts, or any open issues on Github! 
Now that you understand a core component of PyRIT, go ahead and try it out! diff --git a/doc/blog/2026_04_14_scoring_scorers.md b/doc/blog/2026_04_14_scoring_scorers.md index 9e7eec0cbb..9358a8762b 100644 --- a/doc/blog/2026_04_14_scoring_scorers.md +++ b/doc/blog/2026_04_14_scoring_scorers.md @@ -108,7 +108,7 @@ flowchart TB There are a few different ways to view metrics for specific scoring configurations. -**Directly on a scorer instance:** Call `get_scorer_metrics()` on any scorer object to look up its saved metrics (if they exist), as described at the bottom of the [Scorer Evaluation Identifier](#scorer-evaluation-identifier) section above. See the [scorer metrics notebook](../code/scoring/8_scorer_metrics.ipynb) to try it yourself! +**Directly on a scorer instance:** Call `get_scorer_metrics()` on any scorer object to look up its saved metrics (if they exist), as described at the bottom of the [Scorer Evaluation Identifier](#scorer-evaluation-identifier) section above. See the [scorer metrics notebook](../code/scoring/7_scorer_metrics.ipynb) to try it yourself! **Automatically in scenario output:** When running scenarios and printing results (i.e., in [pyrit_scan](../scanner/1_pyrit_scan.ipynb) or [pyrit_shell](../scanner/2_pyrit_shell.md)), metrics are automatically fetched and displayed alongside the attack results (as long as the scoring configuration has been evaluated before): @@ -132,7 +132,7 @@ The framework checks the JSONL registry for an existing entry matching the score ![alt text](2026_04_14_running_evaluation.png) -For the full walkthrough — including running objective and harm evaluations, configuring custom datasets, and comparing results — give the [scorer metrics notebook](../code/scoring/8_scorer_metrics.ipynb) a try! +For the full walkthrough — including running objective and harm evaluations, configuring custom datasets, and comparing results — give the [scorer metrics notebook](../code/scoring/7_scorer_metrics.ipynb) a try! 
## Closing Thoughts diff --git a/doc/contributing/5_unit_tests.md b/doc/contributing/5_unit_tests.md index 07eb74ce1a..b0d48ec879 100644 --- a/doc/contributing/5_unit_tests.md +++ b/doc/contributing/5_unit_tests.md @@ -13,4 +13,4 @@ Testing is an art to get right! But here are some best practices in terms of uni - Don't write to the actual database, use a `MagicMock` for the memory object or use `patch_central_database` as the database connection. -Not all of our current tests follow these practices (we're working on it!) But for some good examples, see [test_tts_send_prompt_file_save_async](../../tests/unit/target/test_tts_target.py), which has many of these best practices incorporated in the test. +Not all of our current tests follow these practices (we're working on it!) But for some good examples, see [test_tts_send_prompt_file_save_async](../../tests/unit/prompt_target/target/test_tts_target.py), which has many of these best practices incorporated in the test. diff --git a/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb b/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb index 254bea3bd1..2b5600ce5a 100644 --- a/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb +++ b/doc/getting_started/troubleshooting/deploy_hf_model_aml.ipynb @@ -19,7 +19,7 @@ " pip install azure-ai-ml\n", " pip install azure-identity\n", " ```\n", - "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section in the markdown file provided [here](../setup/populating_secrets.md)\n", + "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section in the markdown file provided [here](../populating_secrets.md)\n", "- A Hugging Face model should be present in the AZURE ML model catalog. 
If it is missing, execute the [notebook](./download_and_register_hf_model_aml.ipynb) to download and register the Hugging Face model in the AZURE ML registry." ] }, @@ -311,6 +311,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb b/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb index 700e9a9583..29e7fdec00 100644 --- a/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb +++ b/doc/getting_started/troubleshooting/download_and_register_hf_model_aml.ipynb @@ -37,7 +37,7 @@ " pip install azure-ai-ml\n", " pip install azure-identity\n", " ```\n", - "- Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../setup/populating_secrets.md)" + "- Execute the `az login` command to sign in to your Azure subscription. 
For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../populating_secrets.md)" ] }, { @@ -66,9 +66,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "lines_to_next_cell": 0 - }, + "metadata": {}, "outputs": [], "source": [ "# Import the Azure ML SDK components required for workspace connection and model management.\n", @@ -503,6 +501,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", diff --git a/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb b/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb index cc13c0ba1b..76aa4135e5 100644 --- a/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb +++ b/doc/getting_started/troubleshooting/score_aml_endpoint.ipynb @@ -16,7 +16,7 @@ "Before proceeding with this notebook, ensure the following prerequisites are met:\n", "\n", "1. **Azure ML Model Deployment**: Your Azure ML model must be deployed to an Azure ML managed online endpoint. If your model is not yet deployed, please follow the instructions in the [deployment notebook](./deploy_hf_model_aml.ipynb).\n", - "2. Execute the `az login` command to sign in to your Azure subscription. For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../setup/populating_secrets.md)\n", + "2. Execute the `az login` command to sign in to your Azure subscription. 
For detailed instructions, refer to the \"Authenticate with Azure Subscription\" section [here](../populating_secrets.md)\n", "\n", "\n", "## Environment Variables\n", @@ -89,7 +89,9 @@ "cell_type": "code", "execution_count": null, "id": "4", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [ { "name": "stdout", @@ -173,6 +175,9 @@ } ], "metadata": { + "jupytext": { + "main_language": "python" + }, "language_info": { "codemirror_mode": { "name": "ipython", From a3c498e7f2a880d3ccef4e4b1d4f9b9e69c3201a Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 05:45:50 -0700 Subject: [PATCH 04/10] DOC: enable --strict on docs build with targeted error_rules suppressions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #1741. Append '--strict' to both 'docs-build' and 'docs-build-all' in the Makefile so MyST errors (broken links, missing TOC entries, malformed syntax, etc.) hard-fail CI instead of being silently emitted as '⛔' lines in a green build. This is how RTD silently broke before PR #1740. To make '--strict' actually pass today, add a small, well-documented 'error_rules' block to 'doc/myst.yml' that ignores four categories of errors which are inherently outside the project's control or intentionally non-resolvable: 1. 'tex-renders' (blanket): mystmd v2.x has no LaTeX renderer for the MyST layout directives the project uses extensively ('grid', 'tabSet', 'details', 'mermaid'). They render fine in HTML. PDF export emits 'Unhandled LaTeX conversion for node of X' per node. Remove when upstream support lands. 2. 'link-resolves' for auth-required external API docs: 'platform.openai.com/**', 'api.openai.com/**', 'cognitiveservices.azure.com/**'. These always return 401/403 from CI. URLs themselves are correct, just unverifiable without credentials. 3. 'link-resolves' for intentional placeholder URLs: 'pyrit.shared.foo' (style-guide example), 'account.blob.core.windows.net/...' 
(docstring placeholder shape), 'PyRIT/releases/vx.y.z/**' (release-process template that operators substitute at release time). 4. 'link-resolves' for stale URLs in immutable historical blog posts and one current memory doc whose external target moved ('PyRIT/.../pdf_converter.ipynb' was deleted, 'microsoft.github.io/PyRIT/**' old paths from before the v2 site restructure, 'dbeaver.com/.../sqlite.html' link rot). Marked TODO for editorial cleanup in a follow-up. Each suppression carries a comment explaining *why* the URL is suppressed and what would need to change to remove the suppression, so the allow-list stays honest. Verified locally: - 'cd doc && jupyter-book build --all --html --strict' → exit 0 - Same with a deliberately broken link added to 'doc/index.md': → exit 1, error reported, build stops. Confirms '--strict' is effective. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Makefile | 4 ++-- doc/myst.yml | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index d1de710acd..4b6755a9ca 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ ty: docs-build: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json uv run python build_scripts/gen_api_md.py - cd doc && uv run jupyter-book build --all --html + cd doc && uv run jupyter-book build --all --html --strict uv run ./build_scripts/generate_rss.py # Build the full documentation site including the PDF export. 
@@ -36,7 +36,7 @@ docs-build: docs-build-all: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json uv run python build_scripts/gen_api_md.py - cd doc && uv run jupyter-book build --all --html --pdf + cd doc && uv run jupyter-book build --all --html --pdf --strict uv run ./build_scripts/generate_rss.py # Regenerate only the API reference pages (without building the full site) diff --git a/doc/myst.yml b/doc/myst.yml index 98112fd5ed..80361fe65c 100644 --- a/doc/myst.yml +++ b/doc/myst.yml @@ -11,6 +11,59 @@ project: - format: pdf template: plain_latex_book output: exports/book.pdf + # See https://mystmd.org/guide for error_rules schema. + # Rule IDs come from https://github.com/jupyter-book/mystmd + # (packages/myst-common/src/ruleids.ts). + error_rules: + # mystmd v2.x has no LaTeX renderer for the MyST layout directives we use + # extensively in the HTML site (grid, tabSet, details, mermaid). They + # render fine in HTML; PDF export emits "Unhandled LaTeX conversion for + # node of X" errors per node. Track upstream support and remove this + # suppression when those renderers land. + - rule: tex-renders + severity: ignore + + # External docs that require auth — always 401/403 from CI workers and + # public link checkers. The URLs themselves are correct; they just can't + # be verified without credentials. + - rule: link-resolves + severity: ignore + keys: + - "https://platform.openai.com/**" + - "https://api.openai.com/**" + - "http://api.openai.com" + - "http://api.openai.com/**" + - "https://cognitiveservices.azure.com/**" + + # Intentional placeholder URLs that are not meant to resolve: + # - pyrit.shared.foo is a fake example endpoint in the style guide. + # - account.blob.core.windows.net/container/... is a generic Azure + # Blob Storage URL shape used in pyrit.models docstrings. + # - PyRIT/releases/vx.y.z/... 
are template placeholders in + # contributing/10_release_process.md (operators substitute the + # real version at release time). + - rule: link-resolves + severity: ignore + keys: + - "http://pyrit.shared.foo" + - "https://account.blob.core.windows.net/**" + - "https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/**" + - "https://github.com/microsoft/PyRIT/tree/releases/vx.y.z/**" + + # Stale URLs in historical blog posts and one current memory doc. + # Blog posts are append-only / immutable historical content; the linked + # files were deleted (pdf_converter.ipynb) or moved (older + # microsoft.github.io/PyRIT/* paths from before the v2 site + # restructure). The dbeaver URL is real link rot — DBeaver + # reorganized their docs, the replacement isn't a clean 1:1. + # TODO(follow-up): clean these up editorially and tighten this + # suppression list. + - rule: link-resolves + severity: ignore + keys: + - "https://github.com/microsoft/PyRIT/blob/main/doc/code/converters/pdf_converter.ipynb" + - "https://microsoft.github.io/PyRIT/**" + - "https://dbeaver.com/docs/guides/sql_editors/sqlite.html" toc: - file: index.md - file: getting_started/README.md From a0abf51d99567f3ce2d7c3ce1e54ecccd29813d8 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 12:06:37 -0700 Subject: [PATCH 05/10] DOC: fix stale URLs instead of suppressing them in error_rules Replaces the broad link-resolves suppression lists with actual URL fixes: - doc/contributing/3_style_guide.md: wrap pyrit.shared.foo in backticks (was parsed as a URL) - doc/contributing/10_release_process.md: wrap release-template URLs (vx.y.z placeholders) in backticks so they render as code, not links - doc/blog/2025_06_06.md: repoint deleted pdf_converter.ipynb link to current pdf_converter.py source - doc/blog/2025_01_27.md: repoint 3 footnote URLs from stale microsoft.github.io/PyRIT/ paths to current relative paths / source files - doc/code/memory/4_manually_working_with_memory.md: update dbeaver 
SQLite docs URL - pyrit/models/storage_io.py: wrap example Azure Blob URLs in RST double-backticks so the auto-generated API page doesn't parse them as links doc/myst.yml error_rules now contains only the truly unfixable cases: tex-renders (mystmd v2 LaTeX renderer gap) and auth-required external APIs (platform.openai.com, api.openai.com, cognitiveservices.azure.com). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/blog/2025_01_27.md | 6 +-- doc/blog/2025_06_06.md | 2 +- .../memory/4_manually_working_with_memory.md | 2 +- doc/contributing/10_release_process.md | 8 ++-- doc/contributing/3_style_guide.md | 2 +- doc/myst.yml | 38 +++---------------- pyrit/models/storage_io.py | 15 ++++---- 7 files changed, 23 insertions(+), 50 deletions(-) diff --git a/doc/blog/2025_01_27.md b/doc/blog/2025_01_27.md index a151ab321f..9700ef26a5 100644 --- a/doc/blog/2025_01_27.md +++ b/doc/blog/2025_01_27.md @@ -78,11 +78,11 @@ Finally, when PyRIT gets a response from the Target LLM, it switches to another When examining this request, you may discover that occasionally the Adversarial LLM struggles with generating the right JSON format, leading to an error in PyRIT, regardless of whether the objective was achieved or not. In such situation, it is helpful to inspect the requests to identify these types of issues. Specifically, I found a problem when the LLM response contained double quotes, causing issues with subsequent JSON formats which was fixed using the "SearchReplaceConverter"[^9] prompt converter. 
-[^7]: "Multi-Turn Attack - RedTeamingAttack Example", https://microsoft.github.io/PyRIT/code/executor/attack/2_red_teaming_attack.html +[^7]: "Multi-Turn Attack - RedTeamingAttack Example", ../code/executor/attack/2_red_teaming_attack.ipynb -[^8]: "PyRIT - SearchReplaceConverter", https://microsoft.github.io/PyRIT/_autosummary/pyrit.prompt_converter.SearchReplaceConverter.html +[^8]: "PyRIT - SearchReplaceConverter", https://github.com/microsoft/PyRIT/blob/main/pyrit/prompt_converter/search_replace_converter.py -[^9]: "PyRIT - True False Scoring", https://microsoft.github.io/PyRIT/code/scoring/2_true_false_scorers.html#true-false-scoring +[^9]: "PyRIT - True False Scoring", ../code/scoring/2_true_false_scorers.ipynb#true-false-scoring ### Final Thoughts diff --git a/doc/blog/2025_06_06.md b/doc/blog/2025_06_06.md index c9ff1e2a01..8ad7e6dfee 100644 --- a/doc/blog/2025_06_06.md +++ b/doc/blog/2025_06_06.md @@ -12,7 +12,7 @@ The [AI Recruiter](https://github.com/KutalVolkan/ai_recruiter) is designed to m - Résumé Processing & Semantic Matching: Résumés are extracted from PDFs, with embeddings generated using models like text-embedding-ada-002. These embeddings enable semantic matching, while GPT-4o is later used to assign a match score based on relevance and extracted content. -- Automated RAG Vulnerability Testing: Attackers can manipulate résumé content by injecting hidden text (via a [PDF converter](https://github.com/microsoft/PyRIT/blob/main/doc/code/converters/pdf_converter.ipynb)) that optimizes scoring, influencing the AI Recruiter’s ranking system. +- Automated RAG Vulnerability Testing: Attackers can manipulate résumé content by injecting hidden text (via a [PDF converter](https://github.com/microsoft/PyRIT/blob/main/pyrit/prompt_converter/pdf_converter.py)) that optimizes scoring, influencing the AI Recruiter’s ranking system. 
- [XPIA Attack](https://github.com/microsoft/PyRIT/blob/main/doc/code/executor/workflow/2_xpia_ai_recruiter.ipynb) Integration: PyRIT enables full automation of prompt injections, making AI vulnerability research efficient and reproducible. --- diff --git a/doc/code/memory/4_manually_working_with_memory.md b/doc/code/memory/4_manually_working_with_memory.md index f20af7a0e7..7c29b6846b 100644 --- a/doc/code/memory/4_manually_working_with_memory.md +++ b/doc/code/memory/4_manually_working_with_memory.md @@ -13,7 +13,7 @@ The second way to share data is to use local SQLite Memory (see [here](../memory 1. Export and import the database as described [here](https://dbeaver.com/docs/dbeaver/Data-transfer/). This allows a lot of flexibility and can include partial exports (for example based on labels or time): 2. Copy the PyRIT `results/dbdata` directory over; it will contain multi-modal data that the database references. -See https://dbeaver.com/docs/guides/sql_editors/sqlite.html for a more comprehensive guide on using DBeaver with SQLite. +See https://dbeaver.com/docs/dbeaver/Database-driver-SQLite/ for a more comprehensive guide on using DBeaver with SQLite. ## Using SQLite and Excel to Query and Visualize Data diff --git a/doc/contributing/10_release_process.md b/doc/contributing/10_release_process.md index a5aaa644fd..bb9ac4ae86 100644 --- a/doc/contributing/10_release_process.md +++ b/doc/contributing/10_release_process.md @@ -75,15 +75,15 @@ links work properly. Note: There may not be any links to update, but it is good practice to check in case our README changes. Replace all "main" links like -"https://github.com/microsoft/PyRIT/blob/main/doc/index.md" with "raw" links that have +`https://github.com/microsoft/PyRIT/blob/main/doc/index.md` with "raw" links that have the correct version number, i.e., -"https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/doc/index.md". +`https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/doc/index.md`. 
For images, update using the "raw" link, e.g., -"https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/assets/pyrit_architecture.png". +`https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/assets/pyrit_architecture.png`. For directories, update using the "tree" link, e.g., -"https://github.com/microsoft/PyRIT/tree/releases/vx.y.z/doc/code" +`https://github.com/microsoft/PyRIT/tree/releases/vx.y.z/doc/code` This is required for the release branch because PyPI does not pick up other files besides the README, which results in local links breaking. diff --git a/doc/contributing/3_style_guide.md b/doc/contributing/3_style_guide.md index 92e9d2ed38..192b65398b 100644 --- a/doc/contributing/3_style_guide.md +++ b/doc/contributing/3_style_guide.md @@ -26,7 +26,7 @@ Deviations from any particular rule can occur depending on context and need. - One parameter per line - Spaces not tabs. Tab value must be 4 spaces - Test names should be test_foo test_bar, test_baz, etc. -- In the case of type name conflicts, the desired type should be used in its fully-qualified (or disambiguating relatively-qualified) form: e.g. "pyrit.shared.foo" +- In the case of type name conflicts, the desired type should be used in its fully-qualified (or disambiguating relatively-qualified) form: e.g. `pyrit.shared.foo` - Naming should follow typical Python naming. e.g. some_descriptive_name - PyRIT imports go last (after a newline) - Imports should go in alphabetical order diff --git a/doc/myst.yml b/doc/myst.yml index 80361fe65c..792e935dbb 100644 --- a/doc/myst.yml +++ b/doc/myst.yml @@ -23,9 +23,11 @@ project: - rule: tex-renders severity: ignore - # External docs that require auth — always 401/403 from CI workers and - # public link checkers. The URLs themselves are correct; they just can't - # be verified without credentials. + # External APIs that require auth — always return 401/403 from CI workers + # and public link checkers. 
The URLs themselves are correct; they just + # can't be verified without credentials. Every other broken URL in the + # tree has been fixed (see PR #1745); only add to this list when a URL + # is genuinely unverifiable, not just inconvenient to update. - rule: link-resolves severity: ignore keys: @@ -34,36 +36,6 @@ project: - "http://api.openai.com" - "http://api.openai.com/**" - "https://cognitiveservices.azure.com/**" - - # Intentional placeholder URLs that are not meant to resolve: - # - pyrit.shared.foo is a fake example endpoint in the style guide. - # - account.blob.core.windows.net/container/... is a generic Azure - # Blob Storage URL shape used in pyrit.models docstrings. - # - PyRIT/releases/vx.y.z/... are template placeholders in - # contributing/10_release_process.md (operators substitute the - # real version at release time). - - rule: link-resolves - severity: ignore - keys: - - "http://pyrit.shared.foo" - - "https://account.blob.core.windows.net/**" - - "https://raw.githubusercontent.com/microsoft/PyRIT/releases/vx.y.z/**" - - "https://github.com/microsoft/PyRIT/tree/releases/vx.y.z/**" - - # Stale URLs in historical blog posts and one current memory doc. - # Blog posts are append-only / immutable historical content; the linked - # files were deleted (pdf_converter.ipynb) or moved (older - # microsoft.github.io/PyRIT/* paths from before the v2 site - # restructure). The dbeaver URL is real link rot — DBeaver - # reorganized their docs, the replacement isn't a clean 1:1. - # TODO(follow-up): clean these up editorially and tighten this - # suppression list. 
- - rule: link-resolves - severity: ignore - keys: - - "https://github.com/microsoft/PyRIT/blob/main/doc/code/converters/pdf_converter.ipynb" - - "https://microsoft.github.io/PyRIT/**" - - "https://dbeaver.com/docs/guides/sql_editors/sqlite.html" toc: - file: index.md - file: getting_started/README.md diff --git a/pyrit/models/storage_io.py b/pyrit/models/storage_io.py index 9f050b4c62..3491b4b749 100644 --- a/pyrit/models/storage_io.py +++ b/pyrit/models/storage_io.py @@ -300,9 +300,9 @@ async def read_file(self, path: Union[Path, str]) -> bytes: """ Asynchronously reads the content of a file (blob) from Azure Blob Storage. - If the provided `path` is a full URL - (e.g., "https://account.blob.core.windows.net/container/dir1/dir2/sample.png"), - it extracts the relative blob path (e.g., "dir1/dir2/sample.png") to correctly access the blob. + If the provided ``path`` is a full URL + (e.g., ``https://account.blob.core.windows.net/container/dir1/dir2/sample.png``), + it extracts the relative blob path (e.g., ``dir1/dir2/sample.png``) to correctly access the blob. If a relative path is provided, it will use it as-is. Args: @@ -313,10 +313,11 @@ async def read_file(self, path: Union[Path, str]) -> bytes: bytes: The content of the file (blob) as bytes. 
Example: - file_content = - await read_file("https://account.blob.core.windows.net/container/dir2/1726627689003831.png") - # Or using a relative path: - file_content = await read_file("dir1/dir2/1726627689003831.png") + ``file_content = await read_file("https://account.blob.core.windows.net/container/dir2/1726627689003831.png")`` + + Or using a relative path: + + ``file_content = await read_file("dir1/dir2/1726627689003831.png")`` """ if not self._client_async: From d7daca96f9eaad3335ef25a401339430b0180f87 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 12:11:25 -0700 Subject: [PATCH 06/10] DOC: dedupe Makefile docs-build-all target after merging main The merge brought in main's docs-build-all (from PR #1740) while our branch already had it (since we merged the same PR #1740 branch earlier). The duplicate-second-wins behavior of make would have silently dropped --strict from docs-build-all. Removed the duplicate (without --strict). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Makefile | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/Makefile b/Makefile index feacf18981..4b6755a9ca 100644 --- a/Makefile +++ b/Makefile @@ -39,17 +39,6 @@ docs-build-all: cd doc && uv run jupyter-book build --all --html --pdf --strict uv run ./build_scripts/generate_rss.py -# Build the full documentation site including the PDF export. -# Mirrors the ReadTheDocs build (.readthedocs.yaml) so CI catches PDF-only issues -# such as missing images that the HTML-only build silently ignores. -# Requires xelatex / latexmk on PATH (texlive-xetex + texlive-fonts-recommended + -# texlive-plain-generic + latexmk on Ubuntu). 
-docs-build-all: - uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json - uv run python build_scripts/gen_api_md.py - cd doc && uv run jupyter-book build --all --html --pdf - uv run ./build_scripts/generate_rss.py - # Regenerate only the API reference pages (without building the full site) docs-api: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json From 75133348feb03dafc28b458006d6624d3a186f17 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 12:20:04 -0700 Subject: [PATCH 07/10] DOC: link SearchReplaceConverter footnote to autosummary page, not GitHub source Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/blog/2025_01_27.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/blog/2025_01_27.md b/doc/blog/2025_01_27.md index 9700ef26a5..4c83af64b6 100644 --- a/doc/blog/2025_01_27.md +++ b/doc/blog/2025_01_27.md @@ -80,7 +80,7 @@ When examining this request, you may discover that occasionally the Adversarial [^7]: "Multi-Turn Attack - RedTeamingAttack Example", ../code/executor/attack/2_red_teaming_attack.ipynb -[^8]: "PyRIT - SearchReplaceConverter", https://github.com/microsoft/PyRIT/blob/main/pyrit/prompt_converter/search_replace_converter.py +[^8]: "PyRIT - SearchReplaceConverter", ../api/pyrit_prompt_converter.md#searchreplaceconverter [^9]: "PyRIT - True False Scoring", ../code/scoring/2_true_false_scorers.ipynb#true-false-scoring From d312d29fbdc9d80393aad108579f2937a0c08293 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 12:23:12 -0700 Subject: [PATCH 08/10] DOC: link PDF converter footnote to file_converters notebook PDFConverter section Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- doc/blog/2025_06_06.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/blog/2025_06_06.md b/doc/blog/2025_06_06.md index 8ad7e6dfee..c32c43808b 100644 --- a/doc/blog/2025_06_06.md 
+++ b/doc/blog/2025_06_06.md @@ -12,7 +12,7 @@ The [AI Recruiter](https://github.com/KutalVolkan/ai_recruiter) is designed to m - Résumé Processing & Semantic Matching: Résumés are extracted from PDFs, with embeddings generated using models like text-embedding-ada-002. These embeddings enable semantic matching, while GPT-4o is later used to assign a match score based on relevance and extracted content. -- Automated RAG Vulnerability Testing: Attackers can manipulate résumé content by injecting hidden text (via a [PDF converter](https://github.com/microsoft/PyRIT/blob/main/pyrit/prompt_converter/pdf_converter.py)) that optimizes scoring, influencing the AI Recruiter’s ranking system. +- Automated RAG Vulnerability Testing: Attackers can manipulate résumé content by injecting hidden text (via a [PDF converter](../code/converters/5_file_converters.ipynb#pdfconverter)) that optimizes scoring, influencing the AI Recruiter’s ranking system. - [XPIA Attack](https://github.com/microsoft/PyRIT/blob/main/doc/code/executor/workflow/2_xpia_ai_recruiter.ipynb) Integration: PyRIT enables full automation of prompt injections, making AI vulnerability research efficient and reproducible. 
--- From 87a95d1fa0981077548a980cbec780d3ceb24897 Mon Sep 17 00:00:00 2001 From: romanlutz Date: Mon, 18 May 2026 19:28:31 -0700 Subject: [PATCH 09/10] DOC: remove check_links.py pre-commit hook now that --strict covers it Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pre-commit-config.yaml | 10 - build_scripts/check_links.py | 208 ------------------- tests/unit/build_scripts/test_check_links.py | 88 -------- 3 files changed, 306 deletions(-) delete mode 100644 build_scripts/check_links.py delete mode 100644 tests/unit/build_scripts/test_check_links.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8342a80b84..e422b3d987 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -69,16 +69,6 @@ repos: name: Ruff (Jupyter Notebooks) args: [--fix] - - repo: local - hooks: - - id: check-links - name: Check Links in Python and md Files - entry: python ./build_scripts/check_links.py - language: python - files: ^doc.*\.(py|md)$ - additional_dependencies: ['requests'] - exclude: (release_process.md|git.md|^doc/deployment/|tests|pyrit/prompt_converter/morse_converter.py|.github|pyrit/prompt_converter/emoji_converter.py|pyrit/score/markdown_injection.py|^pyrit/datasets/|^pyrit/auxiliary_attacks/gcg/) - - repo: https://github.com/allganize/ty-pre-commit rev: v0.0.32 hooks: diff --git a/build_scripts/check_links.py b/build_scripts/check_links.py deleted file mode 100644 index 827a47c2a3..0000000000 --- a/build_scripts/check_links.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import os -import re -import sys -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from urllib.parse import urlsplit, urlunsplit - -import requests - -skipped_urls = [ - "https://cognitiveservices.azure.com/.default", - "https://gandalf.lakera.ai/api/send-message", - "https://code.visualstudio.com/Download", # This will block python requests - "https://platform.openai.com/docs/api-reference/introduction", # blocks python requests - "https://platform.openai.com/docs/api-reference/responses", # blocks python requests - "https://platform.openai.com/docs/guides/function-calling", # blocks python requests - "https://platform.openai.com/docs/guides/structured-outputs", # blocks python requests - "https://platform.openai.com/api-keys", # blocks python requests (requires auth) - "https://www.anthropic.com/research/many-shot-jailbreaking", # blocks python requests - "https://doi.org/10.1145/3749447", # ACM blocks automated requests - "https://azure.microsoft.com/free/", # Azure blocks automated requests - "https://code.visualstudio.com/docs/devcontainers/containers", - "https://stackoverflow.com/questions/77134272/pip-install-dev-with-pyproject-toml-not-working", - "https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers", -] - -custom_myst_references = ["notebook_tests", "mistralai_mixtral_8x7b_instruct_v0_1"] - -# Updated regex pattern to capture URLs from Markdown and HTML -URL_PATTERN = re.compile(r'\[.*?\]\((.*?)\)|href="([^"]+)"|src="([^"]+)"') - -# Pattern to capture :link: directives from MyST grid-item-cards -GRID_LINK_PATTERN = re.compile(r"^:link:\s+(.+)$", re.MULTILINE) - - -def extract_urls(file_path): - with open(file_path, encoding="utf-8") as file: - content = file.read() - matches = URL_PATTERN.findall(content) - # Flatten the list of tuples and filter out empty strings - urls = [strip_fragment(url) for match in matches for url in match if url] - - # Extract :link: directives from MyST 
grid-item-cards - grid_links = GRID_LINK_PATTERN.findall(content) - urls.extend(grid_links) - - return urls - - -def strip_fragment(url): - """ - Removes the fragment (#...) from the URL, so the base URL can be checked. - """ - parsed_url = urlsplit(url) - return urlunsplit((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.query, "")) - - -def resolve_relative_url(base_path, url): - if not url.startswith(("http://", "https://", "mailto:", "attachment:")): - # Handle MyST doc references (e.g., setup/1b_install_docker) - # These can be .md, .rst, or directory paths - abs_path = os.path.abspath(os.path.join(os.path.dirname(base_path), url)) - - # Check various possible file extensions for doc links - if not os.path.exists(abs_path): - for ext in [".md", ".ipynb"]: - if os.path.exists(abs_path + ext): - return abs_path + ext - - return abs_path - return url - - -def check_url(url, retries=2, delay=2): - """ - Check the validity of a URL, with retries if it fails. - - Args: - url (str): URL to check. - retries (int, optional): Number of retries if the URL check fails. Defaults to 2. - delay (int, optional): Delay in seconds between retries. Defaults to 2. - Returns: - tuple: A tuple containing the URL and a boolean indicating whether it is valid. 
- """ - - if ( - "http://localhost:" in url - or url in skipped_urls - or any(url.endswith(reference) for reference in custom_myst_references) - or os.path.isfile(url) - or os.path.isdir(url) - or url.startswith(("mailto:", "attachment:")) - ): - return url, True - - # If it's not an HTTP URL at this point, it's likely a broken local file reference - if not url.startswith(("http://", "https://")): - return url, False - - attempts = 0 - while attempts <= retries: - try: - response = requests.head(url, allow_redirects=True, timeout=5) - if response.status_code >= 400: - attempts += 1 - if attempts > retries: - return url, False - time.sleep(delay) - else: - return url, True - except requests.RequestException: - attempts += 1 - if attempts > retries: - return url, False - time.sleep(delay) - - # If we exit the loop without returning, the URL is broken - return url, False - - -def extract_all_urls_from_files(files): - """ - Extract all URLs from all files, returning a dict of {file_path: [urls]}. - """ - file_urls = {} - skipped_files = ["doc/blog/"] - - for file_path in files: - if any(file_path.startswith(skipped) for skipped in skipped_files): - continue - urls = extract_urls(file_path) - resolved_urls = [resolve_relative_url(file_path, url) for url in urls] - if resolved_urls: - file_urls[file_path] = resolved_urls - - return file_urls - - -def check_all_links_parallel(file_urls, max_workers=20): - """ - Check all URLs across all files in parallel with a shared thread pool. 
- - Args: - file_urls: Dict of {file_path: [urls]} - max_workers: Max concurrent HTTP requests across ALL files - - Returns: - Dict of {file_path: [broken_urls]} - """ - all_broken_urls = {} - - # Create a mapping of url -> file_path for tracking which file each URL came from - url_to_files = {} - for file_path, urls in file_urls.items(): - for url in urls: - if url not in url_to_files: - url_to_files[url] = [] - url_to_files[url].append(file_path) - - # Check all unique URLs in parallel - url_results = {} - with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = {executor.submit(check_url, url): url for url in url_to_files} - for future in as_completed(futures): - url = futures[future] - _, is_valid = future.result() - url_results[url] = is_valid - - # Map broken URLs back to their files - for url, is_valid in url_results.items(): - if not is_valid: - for file_path in url_to_files[url]: - if file_path not in all_broken_urls: - all_broken_urls[file_path] = [] - all_broken_urls[file_path].append(url) - - return all_broken_urls - - -if __name__ == "__main__": - files = sys.argv[1:] - - print(f"Extracting URLs from {len(files)} file(s)...") - file_urls = extract_all_urls_from_files(files) - - if not file_urls: - print("No URLs found to check.") - sys.exit(0) - - total_urls = sum(len(urls) for urls in file_urls.values()) - unique_urls = len({url for urls in file_urls.values() for url in urls}) - print(f"Checking {unique_urls} unique URL(s) across {len(file_urls)} file(s) (total: {total_urls})...") - - all_broken_urls = check_all_links_parallel(file_urls, max_workers=30) - - if all_broken_urls: - print("\n" + "=" * 80) - for file_path, urls in all_broken_urls.items(): - print(f"Broken links in {file_path}:") - for url in urls: - print(f" - {url}") - print("=" * 80) - sys.exit(1) - else: - print("No broken links found.") diff --git a/tests/unit/build_scripts/test_check_links.py b/tests/unit/build_scripts/test_check_links.py deleted file mode 100644 
index 75bc4aec45..0000000000 --- a/tests/unit/build_scripts/test_check_links.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from pathlib import Path - -from build_scripts.check_links import extract_urls, resolve_relative_url, strip_fragment - - -def test_strip_fragment_removes_fragment() -> None: - assert strip_fragment("https://example.com/page#section") == "https://example.com/page" - - -def test_strip_fragment_no_fragment_unchanged() -> None: - assert strip_fragment("https://example.com/page") == "https://example.com/page" - - -def test_strip_fragment_empty_fragment() -> None: - assert strip_fragment("https://example.com/page#") == "https://example.com/page" - - -def test_strip_fragment_preserves_query_string() -> None: - result = strip_fragment("https://example.com/page?q=1#section") - assert "q=1" in result - assert "section" not in result - - -def test_resolve_relative_url_http_url_unchanged() -> None: - url = "https://example.com" - assert resolve_relative_url("/some/file.md", url) == url - - -def test_resolve_relative_url_mailto_unchanged() -> None: - url = "mailto:test@example.com" - assert resolve_relative_url("/some/file.md", url) == url - - -def test_resolve_relative_url_resolved(tmp_path: Path) -> None: - base = str(tmp_path / "docs" / "file.md") - target = tmp_path / "docs" / "other.md" - target.parent.mkdir(parents=True, exist_ok=True) - target.write_text("# Other") - result = resolve_relative_url(base, "other.md") - assert result == str(target) - - -def test_resolve_relative_url_with_md_extension(tmp_path: Path) -> None: - base = str(tmp_path / "docs" / "file.md") - target = tmp_path / "docs" / "other.md" - target.parent.mkdir(parents=True, exist_ok=True) - target.write_text("# Other") - result = resolve_relative_url(base, "other") - assert result.endswith(".md") - - -def test_extract_urls_extracts_markdown_links(tmp_path: Path) -> None: - f = tmp_path / "test.md" - 
f.write_text("[Click here](https://example.com)") - urls = extract_urls(str(f)) - assert "https://example.com" in urls - - -def test_extract_urls_extracts_href_links(tmp_path: Path) -> None: - f = tmp_path / "test.html" - f.write_text('link') - urls = extract_urls(str(f)) - assert "https://example.com" in urls - - -def test_extract_urls_extracts_src_links(tmp_path: Path) -> None: - f = tmp_path / "test.html" - f.write_text('') - urls = extract_urls(str(f)) - assert "https://example.com/image.png" in urls - - -def test_extract_urls_empty_file_returns_no_urls(tmp_path: Path) -> None: - f = tmp_path / "empty.md" - f.write_text("") - urls = extract_urls(str(f)) - assert urls == [] - - -def test_extract_urls_strips_fragments(tmp_path: Path) -> None: - f = tmp_path / "test.md" - f.write_text("[link](https://example.com/page#section)") - urls = extract_urls(str(f)) - assert "https://example.com/page" in urls - assert not any("#section" in u for u in urls) From 3312d039c4659075c519ef1291f8b2fcbb5cf30c Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Thu, 21 May 2026 06:18:04 -0700 Subject: [PATCH 10/10] DOC: comment Makefile docs targets to explain --strict scope Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 4b6755a9ca..e42722b191 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,7 @@ ty: docs-build: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json uv run python build_scripts/gen_api_md.py + # --strict validates URLs and cross-refs; skips are configured in doc/myst.yml under error_rules cd doc && uv run jupyter-book build --all --html --strict uv run ./build_scripts/generate_rss.py @@ -36,6 +37,7 @@ docs-build: docs-build-all: uv run python build_scripts/pydoc2json.py pyrit --submodules -o doc/_api/pyrit_all.json uv run python build_scripts/gen_api_md.py + # --strict validates URLs and cross-refs; skips are configured in 
doc/myst.yml under error_rules cd doc && uv run jupyter-book build --all --html --pdf --strict uv run ./build_scripts/generate_rss.py