diff --git a/tests/run.py b/tests/run.py index f12c68c..315424c 100644 --- a/tests/run.py +++ b/tests/run.py @@ -34,6 +34,7 @@ uv run tests/run.py uv run tests/run.py --live uv run tests/run.py --performance --repeat 3 + uv run tests/run.py --performance --saml-org-stress --monitor-sourcegraph-load uv run tests/run.py --install uv run tests/run.py --update-golden """ @@ -83,6 +84,10 @@ DEFAULT_PROPERTY_ITERATIONS = 25 DEFAULT_PROPERTY_SEED = 20260610 DEFAULT_PERFORMANCE_REPEAT = 1 +DEFAULT_SAML_ORG_STRESS_USERS = 10000 +DEFAULT_SAML_ORG_STRESS_GROUPS = 10000 +DEFAULT_SAML_ORG_STRESS_GROUPS_PER_USER = 5 +DEFAULT_SAML_ORG_STRESS_PREFIX = "perms-sync-stress-group" EXPLICIT_REPOS_READ_BACK_QUERY = """ query TestExplicitRepoReadBack($username: String!, $first: Int!, $after: String) { @@ -312,6 +317,12 @@ class TestArguments: fail_on_memory_regression_mib: float | None jaeger_trace_limit: int external_sample_interval: float + saml_org_stress: bool + saml_org_stress_users: int + saml_org_stress_groups: int + saml_org_stress_groups_per_user: int + saml_org_stress_prefix: str + keep_saml_org_stress_data: bool monitor_sourcegraph_load: bool monitor_namespace: str monitor_frontend_target: str @@ -443,6 +454,43 @@ def parse_arguments(argv: Sequence[str] | None = None) -> TestArguments: help="Seconds between external process-tree RSS samples during performance cases; " "0 disables (default: 1.0)", ) + performance_group.add_argument( + "--saml-org-stress", + action="store_true", + help="Run only the opt-in sync-saml-orgs breaking-point stress case", + ) + performance_group.add_argument( + "--saml-org-stress-users", + type=int, + default=DEFAULT_SAML_ORG_STRESS_USERS, + help=f"Synthetic users to seed for --saml-org-stress (default: " + f"{DEFAULT_SAML_ORG_STRESS_USERS})", + ) + performance_group.add_argument( + "--saml-org-stress-groups", + type=int, + default=DEFAULT_SAML_ORG_STRESS_GROUPS, + help=f"Distinct SAML groups/orgs to generate for --saml-org-stress " + f"(default: 
def prepare_saml_org_stress(self) -> Callable[[], None] | None:
    """Seed generated SAML accounts before the measured stress run.

    The outcome is recorded under ``perf: sync-saml-orgs stress setup``.

    Returns:
        A zero-argument callable that removes the generated stress data, or
        ``None`` when setup could not be completed (the reason is recorded).
    """
    setup_case = "perf: sync-saml-orgs stress setup"

    # Guard clauses: option combinations the stress case does not support.
    if self.arguments.baseline_command:
        self.record(
            setup_case,
            "performance",
            False,
            0.0,
            "--saml-org-stress cannot be combined with --baseline-command",
        )
        return None
    if self.arguments.test_filter:
        # NOTE(review): presumably bumped so the filter is not reported as
        # unmatched elsewhere -- confirm against the filter bookkeeping.
        self.filter_matched_count += 1
        log.warning("--saml-org-stress ignores the --performance TESTS filter.")

    began = time.monotonic()
    try:
        # Imported lazily so an import failure is recorded instead of raised.
        from src_auth_perms_sync.shared.saml_groups import organization_name_for_saml_group
        from tests import setup as instance_setup

        setup_config = load_setup_config()
        provider = self.saml_auth_provider()
        if provider is None:
            self.record(
                setup_case,
                "performance",
                False,
                time.monotonic() - began,
                f"no SAML auth provider on {self.endpoint}",
            )
            return None

        kubectl_config = cast("dict[str, Any]", setup_config["kubectl"])
        users_config = cast("dict[str, Any]", setup_config["users"])
        configured_accounts = cast(
            "dict[str, list[str]]", setup_config.get("samlAccounts") or {}
        )

        user_count = self.arguments.saml_org_stress_users
        group_count = self.arguments.saml_org_stress_groups
        groups_per_user = self.arguments.saml_org_stress_groups_per_user
        group_prefix = self.arguments.saml_org_stress_prefix
        instance_setup.validate_saml_stress_inputs(
            user_count, group_count, groups_per_user, group_prefix
        )
        organization_name_prefix = organization_name_for_saml_group(
            provider["configID"], self.arguments.saml_org_stress_prefix
        )

        # Everything a cleanup pass needs besides its record name.
        cleanup_targets = (
            kubectl_config,
            users_config,
            configured_accounts,
            provider,
            organization_name_prefix,
        )

        # Remove leftovers from an earlier run before seeding fresh data.
        stale_removed = self.cleanup_saml_org_stress_data(
            "perf: sync-saml-orgs stress stale cleanup", *cleanup_targets
        )
        if not stale_removed:
            return None

        account_count = instance_setup.upsert_saml_stress_accounts(
            kubectl_config,
            users_config,
            service_id=provider["serviceID"],
            client_id=provider["clientID"],
            user_count=self.arguments.saml_org_stress_users,
            group_count=self.arguments.saml_org_stress_groups,
            groups_per_user=self.arguments.saml_org_stress_groups_per_user,
            group_prefix=self.arguments.saml_org_stress_prefix,
        )
    except Exception as error:  # noqa: BLE001 - record, don't kill the suite.
        self.record(
            setup_case,
            "performance",
            False,
            time.monotonic() - began,
            f"{type(error).__name__}: {error}",
        )
        return None

    membership_total = (
        self.arguments.saml_org_stress_users * self.arguments.saml_org_stress_groups_per_user
    )
    summary = (
        f"seeded {account_count}/{self.arguments.saml_org_stress_users} user account(s), "
        f"{self.arguments.saml_org_stress_groups} group(s), "
        f"{membership_total} desired membership(s)"
    )
    # Setup only passes when every requested user received an account.
    seeded_all = account_count == self.arguments.saml_org_stress_users
    self.record(setup_case, "performance", seeded_all, time.monotonic() - began, summary)
    log.info("SAML org stress setup: %s", summary)

    if not seeded_all:
        self.cleanup_saml_org_stress_data(
            "perf: sync-saml-orgs stress failed-setup cleanup", *cleanup_targets
        )
        return None

    if self.arguments.keep_saml_org_stress_data:
        log.warning(
            "Keeping SAML org stress data. Re-run without --keep-saml-org-stress-data "
            "to clean it before the next measured stress run."
        )

    def remove_stress_data() -> None:
        self.cleanup_saml_org_stress_data(
            "perf: sync-saml-orgs stress cleanup", *cleanup_targets
        )

    return remove_stress_data


def cleanup_saml_org_stress_data(
    self,
    name: str,
    kubectl_config: dict[str, Any],
    users_config: dict[str, Any],
    configured_accounts: dict[str, list[str]],
    provider: dict[str, str],
    organization_name_prefix: str,
) -> bool:
    """Delete generated stress data and restore setup.yaml SAML accounts.

    The result is recorded under ``name`` in the ``performance`` category.

    Returns:
        ``True`` when every step succeeded, ``False`` when any step raised
        (the exception is recorded rather than propagated).
    """
    began = time.monotonic()
    try:
        from tests import setup as instance_setup

        removed = instance_setup.delete_saml_stress_organizations(
            kubectl_config, organization_name_prefix
        )
        deleted_members, deleted_organizations = removed

        service_id = provider["serviceID"]
        client_id = provider["clientID"]
        deleted_accounts = instance_setup.delete_saml_stress_accounts(
            kubectl_config,
            users_config,
            service_id=service_id,
            client_id=client_id,
            group_prefix=self.arguments.saml_org_stress_prefix,
        )

        # Re-apply the accounts declared in setup.yaml.
        email_template = str(users_config["emailTemplate"])
        for username in configured_accounts:
            instance_setup.upsert_saml_account(
                kubectl_config,
                username,
                configured_accounts[username],
                service_id=provider["serviceID"],
                client_id=provider["clientID"],
                account_id=email_template.replace("{username}", username),
            )
    except Exception as error:  # noqa: BLE001 - record, don't kill the suite.
        self.record(
            name,
            "performance",
            False,
            time.monotonic() - began,
            f"{type(error).__name__}: {error}",
        )
        return False

    summary = (
        f"deleted {deleted_organizations} org(s), {deleted_members} member row(s), "
        f"{deleted_accounts} stress account(s); restored {len(configured_accounts)} "
        "setup account(s)"
    )
    self.record(name, "performance", True, time.monotonic() - began, summary)
    log.info("SAML org stress cleanup: %s", summary)
    return True
def sql_literal(value: str) -> str:
    """Return *value* as a single-quoted SQL string literal.

    Embedded single quotes are doubled. Backslashes are passed through
    unchanged.
    """
    # NOTE(review): assumes the server runs with standard_conforming_strings
    # enabled, so backslashes are not special inside '...' -- confirm.
    return "'" + value.replace("'", "''") + "'"


def sql_like_literal(value: str) -> str:
    """Return a quoted SQL LIKE pattern with wildcard characters escaped.

    Backslash is the escape character: ``\\``, ``%`` and ``_`` in *value* are
    each prefixed with a backslash so they match literally.
    """
    # Escape the escape character first so the backslashes added for % and _
    # are not doubled again.
    escaped = value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return sql_literal(escaped)


def email_suffix(users_config: dict[str, Any]) -> str:
    """Return the configured email suffix, validating the expected template shape.

    ``users_config["emailTemplate"]`` must start with ``{username}@``, contain
    the placeholder exactly once, and have a domain part accepted by
    ``SAFE_NAME_PATTERN``.

    Returns:
        Everything after the placeholder, including the leading ``@``.

    Raises:
        RuntimeError: If the template does not have the expected shape.
    """
    placeholder = "{username}"
    template = str(users_config["emailTemplate"])
    suffix = template[len(placeholder):]
    # Structural checks run first. A template with a second placeholder or
    # without the "@" separator would make "username || suffix" differ from
    # template.replace("{username}", username), so reject it outright.
    well_formed = (
        template.startswith(placeholder + "@")
        and placeholder not in suffix
        and SAFE_NAME_PATTERN.match(suffix[1:]) is not None
    )
    if not well_formed:
        raise RuntimeError(f"emailTemplate must be '{{username}}@': {template!r}")
    return suffix
def upsert_saml_stress_accounts(
    kubectl_config: dict[str, Any],
    users_config: dict[str, Any],
    *,
    service_id: str,
    client_id: str,
    user_count: int,
    group_count: int,
    groups_per_user: int,
    group_prefix: str,
) -> int:
    """Bulk-write generated SAML group claims for synthetic stress users.

    Selects the first ``user_count`` non-deleted users (ordered by username)
    whose username matches ``users_config["usernamePattern"]`` and upserts one
    ``saml`` external account per user. Each account carries
    ``groups_per_user`` group claims named ``<group_prefix>-<padded index>``,
    where the index is ``(user_index * groups_per_user + offset) % group_count``.

    Returns:
        Number of account rows inserted or updated.

    Raises:
        RuntimeError: If the inputs fail validation.
    """
    validate_saml_stress_inputs(user_count, group_count, groups_per_user, group_prefix)
    suffix_literal = sql_literal(email_suffix(users_config))
    username_pattern = sql_literal(str(users_config["usernamePattern"]))
    last_offset = groups_per_user - 1
    # lpad() truncates values longer than the target width, so a fixed width
    # of 5 would fold indices >= 100000 onto existing names. Keep 5 digits for
    # the usual sizes and widen only when the largest index needs more.
    index_width = max(5, len(str(group_count - 1)))
    rows = run_sql(
        kubectl_config,
        f"""
WITH selected_users AS MATERIALIZED (
  SELECT
    u.id,
    u.username,
    row_number() OVER (ORDER BY u.username) - 1 AS user_index
  FROM users u
  WHERE u.username ~ {username_pattern}
    AND u.deleted_at IS NULL
  ORDER BY u.username
  LIMIT {user_count}
),
group_claims AS (
  SELECT
    selected_users.id AS user_id,
    {sql_literal(group_prefix)} || '-' ||
      lpad(((selected_users.user_index * {groups_per_user} + offset_index)
        % {group_count})::text, {index_width}, '0') AS group_name
  FROM selected_users
  CROSS JOIN generate_series(0, {last_offset}) AS generated_offsets(offset_index)
),
account_rows AS (
  SELECT
    selected_users.id AS user_id,
    selected_users.username || {suffix_literal} AS account_id,
    jsonb_build_object(
      'NameID', selected_users.username || {suffix_literal},
      'Values', jsonb_build_object(
        'groups', jsonb_build_object(
          'Name', 'groups',
          'Values', jsonb_agg(
            jsonb_build_object('Value', group_claims.group_name)
            ORDER BY group_claims.group_name
          )
        ),
        'Email', jsonb_build_object(
          'Name', 'Email',
          'Values', jsonb_build_array(
            jsonb_build_object('Value', selected_users.username || {suffix_literal})
          )
        )
      )
    )::text AS account_data
  FROM selected_users
  JOIN group_claims ON group_claims.user_id = selected_users.id
  GROUP BY selected_users.id, selected_users.username
),
upserted AS (
  INSERT INTO user_external_accounts
    (user_id, service_type, service_id, client_id, account_id,
     account_data, encryption_key_id, kind)
  SELECT
    account_rows.user_id,
    'saml',
    {sql_literal(service_id)},
    {sql_literal(client_id)},
    account_rows.account_id,
    account_rows.account_data,
    '',
    'AUTH'
  FROM account_rows
  ON CONFLICT (tenant_id, user_id, service_type, service_id, client_id,
               account_id, kind) WHERE deleted_at IS NULL
  DO UPDATE SET account_data = EXCLUDED.account_data, updated_at = now()
  RETURNING 1
)
SELECT count(*) FROM upserted;
""",
        timeout_seconds=600,
    )
    return int(rows[0][0]) if rows and rows[0] else 0


def delete_saml_stress_accounts(
    kubectl_config: dict[str, Any],
    users_config: dict[str, Any],
    *,
    service_id: str,
    client_id: str,
    group_prefix: str,
) -> int:
    """Delete generated SAML stress accounts from synthetic users.

    Targets non-deleted ``saml`` accounts for the given provider that belong
    to users matching ``users_config["usernamePattern"]`` and whose account
    data contains ``<group_prefix>-``.

    Returns:
        Number of account rows deleted.

    Raises:
        RuntimeError: If ``group_prefix`` is rejected by ``SAFE_NAME_PATTERN``.
    """
    if not SAFE_NAME_PATTERN.match(group_prefix):
        raise RuntimeError(f"unsafe SAML stress group prefix: {group_prefix!r}")
    username_pattern = sql_literal(str(users_config["usernamePattern"]))
    claim_pattern = sql_like_literal(group_prefix + "-")
    # The doubled backslash below is deliberate: in a non-raw Python string
    # "\'" collapses to "'", which would send ESCAPE '' to PostgreSQL and turn
    # LIKE escaping off, so the \_ and \% produced by sql_like_literal would
    # stop matching.
    rows = run_sql(
        kubectl_config,
        f"""
WITH deleted AS (
  DELETE FROM user_external_accounts account
  USING users u
  WHERE account.user_id = u.id
    AND u.username ~ {username_pattern}
    AND u.deleted_at IS NULL
    AND account.deleted_at IS NULL
    AND account.service_type = 'saml'
    AND account.service_id = {sql_literal(service_id)}
    AND account.client_id = {sql_literal(client_id)}
    AND account.account_data::text LIKE '%' || {claim_pattern}
      || '%'
      ESCAPE '\\'
  RETURNING 1
)
SELECT count(*) FROM deleted;
""",
        timeout_seconds=600,
    )
    return int(rows[0][0]) if rows and rows[0] else 0


def delete_saml_stress_organizations(
    kubectl_config: dict[str, Any], organization_name_prefix: str
) -> tuple[int, int]:
    """Delete generated SAML stress organizations and their memberships.

    Targets non-deleted orgs whose name starts with
    ``<organization_name_prefix>-``.

    Returns:
        ``(deleted_member_rows, deleted_org_rows)``; ``(0, 0)`` when the query
        returns no usable row.

    Raises:
        RuntimeError: If the prefix is rejected by ``SAFE_NAME_PATTERN``.
    """
    if not SAFE_NAME_PATTERN.match(organization_name_prefix):
        raise RuntimeError(
            f"unsafe SAML stress organization prefix: {organization_name_prefix!r}"
        )
    name_pattern = sql_like_literal(organization_name_prefix + "-")
    # ESCAPE needs a doubled backslash here so the SQL text really contains
    # ESCAPE '\' (a bare "\'" in Python source would yield ESCAPE '').
    rows = run_sql(
        kubectl_config,
        f"""
WITH target_orgs AS MATERIALIZED (
  SELECT id
  FROM orgs
  WHERE name::text LIKE {name_pattern} || '%'
    ESCAPE '\\'
    AND deleted_at IS NULL
),
deleted_members AS (
  DELETE FROM org_members
  WHERE org_id IN (SELECT id FROM target_orgs)
  RETURNING 1
),
deleted_orgs AS (
  DELETE FROM orgs
  WHERE id IN (SELECT id FROM target_orgs)
  RETURNING 1
)
SELECT
  (SELECT count(*) FROM deleted_members),
  (SELECT count(*) FROM deleted_orgs);
""",
        timeout_seconds=600,
    )
    if not rows or len(rows[0]) < 2:
        return (0, 0)
    return (int(rows[0][0]), int(rows[0][1]))


def validate_saml_stress_inputs(
    user_count: int, group_count: int, groups_per_user: int, group_prefix: str
) -> None:
    """Validate generated stress dimensions before interpolating SQL.

    Raises:
        RuntimeError: If any count is not positive, if ``groups_per_user``
            exceeds ``group_count``, or if ``group_prefix`` is rejected by
            ``SAFE_NAME_PATTERN``.
    """
    if user_count <= 0:
        raise RuntimeError("SAML stress user count must be positive")
    if group_count <= 0:
        raise RuntimeError("SAML stress group count must be positive")
    if groups_per_user <= 0:
        raise RuntimeError("SAML stress groups per user must be positive")
    if groups_per_user > group_count:
        raise RuntimeError("SAML stress groups per user cannot exceed group count")
    if not SAFE_NAME_PATTERN.match(group_prefix):
        raise RuntimeError(f"unsafe SAML stress group prefix: {group_prefix!r}")