diff --git a/docs/architecture/00-baseline/v1/url-shortener-v1-hld.excalidraw b/docs/architecture/00-baseline/v1/url-shortener-v1-hld.excalidraw index 15afebf..f8b3507 100644 --- a/docs/architecture/00-baseline/v1/url-shortener-v1-hld.excalidraw +++ b/docs/architecture/00-baseline/v1/url-shortener-v1-hld.excalidraw @@ -2142,8 +2142,8 @@ "frameId": null, "x": -220.66529003471368, "y": -303.5077851544676, - "width": 171.50159565320678, - "height": 128.9607875234742, + "width": 153.29535786023803, + "height": 125.23422502347421, "angle": 0, "strokeColor": "#f08c00", "backgroundColor": "transparent", @@ -2155,8 +2155,8 @@ "groupIds": [], "roundness": null, "seed": 1036090294, - "version": 4124, - "versionNonce": 675132968, + "version": 4179, + "versionNonce": 648276660, "isDeleted": false, "boundElements": [ { @@ -2164,7 +2164,7 @@ "type": "text" } ], - "updated": 1774769149312, + "updated": 1780887497358, "link": null, "locked": false, "points": [ @@ -2174,15 +2174,15 @@ ], [ 0, - 63.59563320416075 + 61.73235195416075 ], [ - 171.50159565320678, - 63.59563320416075 + 153.29535786023803, + 61.73235195416075 ], [ - 171.50159565320678, - 128.9607875234742 + 153.29535786023803, + 125.23422502347421 ] ], "lastCommittedPoint": null, @@ -2259,10 +2259,10 @@ "type": "arrow", "index": "agG", "frameId": null, - "x": -50.10924709348319, - "y": -54.621333771234035, - "width": 308.1567959810417, - "height": 175.36087468829965, + "x": -68.31548488645194, + "y": -58.347896271234035, + "width": 326.36303377401043, + "height": 179.08743718829965, "angle": 0, "strokeColor": "#f08c00", "backgroundColor": "transparent", @@ -2274,8 +2274,8 @@ "groupIds": [], "roundness": null, "seed": 1565690968, - "version": 4831, - "versionNonce": 913617192, + "version": 4887, + "versionNonce": 1655437108, "isDeleted": false, "boundElements": [ { @@ -2283,7 +2283,7 @@ "type": "text" } ], - "updated": 1774769149313, + "updated": 1780887497472, "link": null, "locked": false, "points": [ @@ -2293,19 
+2293,19 @@ ], [ 0, - 111.42588623872977 + 115.15244873872977 ], [ - 97.94944876125368, - 111.42588623872977 + 116.15568655422243, + 115.15244873872977 ], [ - 97.94944876125368, - 175.36087468829965 + 116.15568655422243, + 179.08743718829965 ], [ - 308.1567959810417, - 175.36087468829965 + 326.36303377401043, + 179.08743718829965 ] ], "lastCommittedPoint": null, @@ -2341,22 +2341,22 @@ "index": 2, "start": [ 0, - 111.42588623872977 + 115.15244873872977 ], "end": [ - 97.94944876125368, - 111.42588623872977 + 116.15568655422243, + 115.15244873872977 ] }, { "index": 3, "start": [ - 97.94944876125368, - 111.42588623872977 + 116.15568655422243, + 115.15244873872977 ], "end": [ - 97.94944876125368, - 175.36087468829965 + 116.15568655422243, + 179.08743718829965 ] } ], @@ -2405,10 +2405,10 @@ "type": "arrow", "index": "agd", "frameId": null, - "x": -179.99130314362878, - "y": -54.43363240533003, - "width": 0.5090771940789693, - "height": 183.66922976935638, + "x": -198.19754093659753, + "y": -58.16019490533003, + "width": 53.18648124659856, + "height": 188.37120351866412, "angle": 0, "strokeColor": "#f08c00", "backgroundColor": "transparent", @@ -2420,8 +2420,8 @@ "groupIds": [], "roundness": null, "seed": 777376600, - "version": 5581, - "versionNonce": 843957800, + "version": 5636, + "versionNonce": 336055732, "isDeleted": false, "boundElements": [ { @@ -2429,7 +2429,7 @@ "type": "text" } ], - "updated": 1774769151817, + "updated": 1780887497359, "link": null, "locked": false, "points": [ @@ -2438,8 +2438,16 @@ 0 ], [ - 0.5090771940789693, - 183.66922976935638 + 0, + 94.08843335104227 + ], + [ + 53.18648124659856, + 94.08843335104227 + ], + [ + 53.18648124659856, + 188.37120351866412 ] ], "lastCommittedPoint": null, @@ -3298,8 +3306,8 @@ "type": "rectangle", "index": "ay", "frameId": null, - "x": -209.12305427739824, - "y": -169.6279692219096, + "x": -227.329292070367, + "y": -173.3545317219096, "width": 199.74500710497375, "height": 110, "angle": 0, @@ -3315,8 +3323,8 
@@ "type": 3 }, "seed": 1430654040, - "version": 1498, - "versionNonce": 2066601560, + "version": 1553, + "versionNonce": 968050612, "isDeleted": false, "boundElements": [ { @@ -3336,15 +3344,15 @@ "type": "arrow" } ], - "updated": 1774769149311, + "updated": 1780887497357, "link": null, "locked": false }, { "id": "qKIIAMdmMGTIov9tNsvuj", "type": "text", - "x": -197.94049213116136, - "y": -164.6279692219096, + "x": -216.1467299241301, + "y": -168.3545317219096, "width": 177.3798828125, "height": 100, "angle": 0, @@ -3360,11 +3368,11 @@ "index": "az", "roundness": null, "seed": 376619352, - "version": 283, - "versionNonce": 921661992, + "version": 338, + "versionNonce": 2027434292, "isDeleted": false, "boundElements": [], - "updated": 1774769106192, + "updated": 1780887497357, "link": null, "locked": false, "text": "Cloudflare\n(DDoS Absorption\nWAF Rate Limiting\nDNS Proxy)", diff --git a/docs/insights/Extracting Training Data from Large Language Models Review.txt b/docs/insights/Extracting Training Data from Large Language Models Review.txt new file mode 100644 index 0000000..fc6fdae --- /dev/null +++ b/docs/insights/Extracting Training Data from Large Language Models Review.txt @@ -0,0 +1,14 @@ +Title: Extracting Training Data from Large Language Models + +Professor Name: Habeeb Olufowobi +Student Name: Harshwardhan Patil +Student ID: 1002224144 +Date: 04.22.2026 + +This paper by Carlini and colleagues investigates whether an adversary can recover verbatim text from a large language model's training data using nothing but black-box query access. The authors target GPT-2, a 1.5 billion parameter model trained on 40 gigabytes of public internet text, and demonstrate a two-phase attack. The first phase generates 200,000 text samples from the model using three different generation strategies, basic top-n sampling, decaying temperature sampling, and seeding with real internet text prefixes scraped from Common Crawl. 
The second phase filters those samples using six membership inference metrics, all based on comparing GPT-2's likelihood against a reference, either a smaller GPT-2 variant or a classical compression algorithm called zlib, to find samples where GPT-2 is anomalously confident. The authors verify results through both Google search and direct query access to OpenAI's training dataset, confirming 604 unique memorized examples with a best-case precision of 67%. Importantly, the paper frames this 604 figure explicitly as a lower bound, stating that among 600,000 honestly generated samples (200,000 from each of the three generation strategies), at least 0.1% contain memorized text, and that this 0.1% is itself an extremely loose floor given that the extraction only used simple short prompts, and nearly no extracted example could be reproduced with the short prompt that originally surfaced it but nearly all reproduced when given the full preceding training context. + +The paper's central contribution goes beyond the attack itself. Before defining the attack, the authors formally define what it means for a model to know a string: a string is considered extractable if there exists some prefix that causes the model to generate it as its most likely continuation. This definition is the foundation everything else builds on. They then introduce k-eidetic memorization, where k represents how many distinct training documents contain the memorized string, with k equals 1 being the most sensitive case. One subtle but important aspect of this definition is that it counts distinct documents, not total occurrences, meaning a string that appears 50 times within a single document still counts as k equals 1 memorization. The paper specifically criticizes GPT-2's document-level deduplication as a result, arguing it is insufficient because a string can appear dozens of times within one document, escape deduplication entirely, and be repeated enough times to get fully memorized. 
The authors demonstrate k equals 1 extraction of personally identifiable information including a real individual's full name, address, phone number, email address, and fax number, and show that memorization scales with model size, GPT-2 XL memorizes 18 times more content than GPT-2 Small, with complete memorization triggering at just 33 repetitions within a single training document. + +There is a lot to appreciate in how this paper is structured and argued. The decision to attack GPT-2 specifically is ethically well-reasoned, and the dual verification process gives the results a level of credibility that is rare in this area. Most related work on memorization relied on artificially inserting canary sequences, fake secrets planted deliberately into training data, and then checking if they leak. The critical limitation of that prior approach is that the researcher already knew what secret to look for. This paper finds memorization of naturally occurring content without knowing in advance what to search for, which is a fundamentally stronger and more realistic threat demonstration. The finding about deleted content being recoverable from the model was one of the more striking results, the idea that GPT-2 functions as an unintentional archive of content that no longer exists on the web is something I had not considered before reading this paper. I also found the contextual integrity examples particularly disturbing. The model combining two completely unrelated memorized fragments into a false narrative about a real person, attributing a 2013 murder to a victim from the 2016 Orlando shooting, is a failure mode that goes beyond simple data leakage into something closer to automated defamation, and it happens without any adversary intending it. 
The paper also raises the open question of whether fine-tuning a model on task-specific data causes it to forget pre-training memorization or introduce new memorization from the fine-tuning data, a direction that remains unexplored and has significant implications for how deployed models should be audited. + +The finding I found most confusing on first read was the relationship between overfitting and memorization. The paper argues that no overfitting does not mean no memorization, and the reasoning is sound once you understand that the train-test gap is an average measure while memorization is a worst-case phenomenon at the level of individual examples. But the paper takes several pages to make this distinction clear, and an early concrete example of a specific training document with anomalously low loss would have helped ground this before the formal definitions. The paper is also careful throughout to say memorization correlates with certain conditions rather than claiming causation, it explicitly acknowledges that understanding why models memorize is an open question and that its results are observational. This epistemic honesty is appropriate but I initially expected stronger causal claims given how confidently the attack itself is presented. Finally, I initially expected the baseline attack to at least partially surface sensitive content, since the authors present it as a working first attempt. In reality it finds nothing privately sensitive, only widely repeated public content like software licenses and common boilerplate. Calling this a weakness understates the case, the baseline completely fails at the actual goal of recovering private data, which makes the gap between the naive and improved approaches more significant than the paper initially signals. 
diff --git a/docs/insights/SoK: Science, Security, and the Elusive Goal of Security as a Scientific Pursuit Review.txt b/docs/insights/SoK: Science, Security, and the Elusive Goal of Security as a Scientific Pursuit Review.txt new file mode 100644 index 0000000..bdc84e5 --- /dev/null +++ b/docs/insights/SoK: Science, Security, and the Elusive Goal of Security as a Scientific Pursuit Review.txt @@ -0,0 +1,30 @@ +Title: SoK: Science, Security, and the Elusive Goal of Security as a Scientific Pursuit + +Professor Name: Habeeb Olufowobi +Student Name: Harshwardhan Patil +Student ID: 1002224144 +Date: 04.22.2026 + +Summary + +The paper “SoK: Science, Security, and the Elusive Goal of Security as a Scientific Pursuit” presents a systematic effort to define cybersecurity as a rigorous scientific discipline rather than just a collection of ad hoc practices. The authors argue that while fields like physics and mathematics have well-established theoretical foundations, cybersecurity lacks universally accepted principles, formal models, and repeatable experimental methodologies. As a result, security solutions are often reactive, inconsistent, and difficult to validate. The paper emphasizes that security should be treated not as a binary property (secure vs. insecure) but as a probabilistic and evolving state where systems are continuously exposed to new threats. + +A central theme of the paper is the gap between formal security models and real-world systems. The authors highlight that many cryptographic and system-level security models fail to account for practical attack vectors such as side-channel attacks, implementation flaws, and human behavior. For example, even strong cryptographic systems can fail due to leakage or assumptions that do not hold in practice. The paper also discusses the difficulty of defining “correct” security properties, as different stakeholders may have conflicting interpretations of what security means. This leads to inconsistencies in evaluation and implementation. 
+ +Another important contribution of the paper is its focus on measurement and experimentation in security research. Unlike traditional sciences, where experiments can be replicated under controlled conditions, cybersecurity experiments are often difficult to reproduce due to evolving environments, hidden variables, and adversarial behavior. The paper argues for the development of standardized metrics and methodologies to improve the reliability and comparability of security research. It also stresses the importance of hypothesis-driven approaches, suggesting that security decisions should be treated as scientific hypotheses that can be tested and falsified. + +Overall, the paper aims to push the field toward a more disciplined and structured approach, where security research is grounded in theory, validated through experiments, and continuously refined based on empirical evidence. + +Discussion + +One of the most compelling aspects of this paper is its honest critique of the current state of cybersecurity. The authors do not attempt to present security as a mature or fully understood field; instead, they acknowledge its limitations and highlight the challenges in establishing it as a true science. This level of transparency is valuable because it sets realistic expectations for both researchers and practitioners. The idea that “we can never be sure that we are secure, only that we are insecure” is particularly powerful, as it reframes security from a goal to a continuous process. + +A major strength of the paper is its discussion of the disconnect between theoretical models and practical implementations. I found the examples from cryptography especially insightful, where formal proofs often fail to capture real-world attack vectors like side-channel attacks. This highlights a critical issue: even mathematically sound systems can be insecure when deployed. 
This was somewhat surprising because cryptography is often considered the “most rigorous” area of cybersecurity, yet even it suffers from fundamental limitations. The paper effectively demonstrates that relying solely on formal models can create a false sense of security. + +However, one aspect I did not fully appreciate is the lack of concrete solutions. While the paper clearly identifies the problems in the field, it remains somewhat abstract when proposing solutions. For instance, it advocates for better metrics and experimental rigor but does not provide detailed frameworks or methodologies for achieving these goals. Including case studies or examples of successful scientific approaches in cybersecurity would have strengthened the paper significantly. + +Another area that could have been improved is clarity in certain sections. The discussion around security definitions and models can become dense and difficult to follow, especially for readers who are not already familiar with the field. For example, the debate over “correct” definitions of security is important but could have benefited from simpler examples or visual representations. At times, the paper assumes a high level of prior knowledge, which may limit its accessibility to beginners. + +One point that I found particularly thought-provoking is the emphasis on assumptions in security systems. The paper suggests that many vulnerabilities arise not from flaws in design but from incorrect or incomplete assumptions about the environment or adversary. This aligns closely with real-world incidents, where systems fail because they were not designed with the correct threat model in mind. It reinforces the idea that understanding the attacker is just as important as building defenses. + +Overall, this paper provides a critical and reflective perspective on the field of cybersecurity. It challenges the reader to think beyond tools and techniques and consider the foundational principles that govern security. 
While it does not provide definitive answers, it successfully highlights the need for a more structured, scientific approach to security research. The key takeaway is that cybersecurity is still an evolving discipline, and developing it into a true science requires better models, better measurements, and a deeper understanding of real-world systems. \ No newline at end of file diff --git a/docs/insights/aws-cost-audit.md b/docs/insights/aws-cost-audit.md new file mode 100644 index 0000000..6a35836 --- /dev/null +++ b/docs/insights/aws-cost-audit.md @@ -0,0 +1,692 @@ +# AWS Cost Audit — Infrastructure Inventory & Reduction Plan + +**Audited:** 2026-04-02 +**Last updated:** 2026-04-03 +**Region:** us-east-1 +**Account:** (redacted) + +--- + +## TL;DR — Current Status + +| | Amount | +| --- | --- | +| Feb 2026 actual | ~$2.93 | +| Mar 2026 actual | ~$44.16 (inc. tax) | +| Apr 2026 projected at start of audit | ~$81/month | +| **Apr 2026 projected now (after actions taken)** | **~$52/month** | +| Saved so far | **~$29/month** | +| Actions completed | Release idle EIP + TinyURL EC2 downsize + Night scheduler | + +The bill spike is entirely caused by deploying TinyURL as a separate production-grade AWS stack on **March 26, 2026**. +No NAT gateway, no ECS, no Lambda — just always-on infrastructure with no traffic to justify it. + +--- + +## Complete Infrastructure Inventory (current state) + +### EC2 Instances + +| Name | ID | Type | AZ | Public IP | Cost/month | +| --- | --- | --- | --- | --- | --- | +| ems-prod-app | `` | t2.micro | us-east-1e | EIP (static, permanent) | $8.35 | +| tinyurl-prod | `` | **t3.micro** | us-east-1a | auto-assigned | **$7.49** | + +**ems-prod-app** — Runs 4 Docker services: Spring Boot (EMS app) + PostgreSQL + Redis + Nginx. +Migrated from RDS to Docker Postgres in March 2026. DNS: `ems.buffden.com` → Elastic IP (permanent, assigned 2026-04-02). + +**tinyurl-prod** — Runs TinyURL Spring Boot app only. DB is on RDS. 
EC2 security group accepts +traffic from the ALB only — direct access is blocked even though a public IP is assigned. + +**AZ lock-in:** EMS is in `us-east-1e`, which only supports the t2 family. t3, t3a, and t4g are +not available there. EMS cannot be downsized further without re-launching it in a different AZ. + +### EBS Volumes + +| Volume ID | Size | Type | Attached To | +| --- | --- | --- | --- | +| `` | 8 GB | gp3 | ems-prod-app | +| `` | 20 GB | gp2 | tinyurl-prod | + +No unattached volumes. No orphaned snapshots. + +### RDS Instances + +| Identifier | Class | Engine | Multi-AZ | Storage | Cost/month | +| --- | --- | --- | --- | --- | --- | +| tinyurl-prod | db.t4g.micro | PostgreSQL 17.4 | No | 20 GB gp3 | $13.70 | + +Backup retention: 7 days. Performance Insights: enabled (free at this tier). No read replicas. +The old EMS RDS (db.t3.micro) was deleted after the Docker Postgres migration in March 2026. + +### Load Balancers + +| Name | Type | State | Created | Cost/month | +| --- | --- | --- | --- | --- | +| tinyurl-alb | application | active | Mar 26 2026 | $14.01 | + +Target group: `tg-tinyurl-api` → HTTP:80 → instance-type → `tinyurl-prod` EC2. +DNS for TinyURL API: `go.buffden.com` → ALB DNS (auto-generated by AWS) + +### Elastic IPs + +| IP | Allocation ID | Status | Attached To | Cost/month | +| --- | --- | --- | --- | --- | +| `` | `` | In-use | ALB ENI in tinyurl-public-1a | $3.65 | +| `` | `` | In-use | ALB ENI in tinyurl-public-1b | $3.65 | +| ~~(released)~~ | ~~(released)~~ | **RELEASED 2026-04-02** | — | $0 | + +Both active EIPs are attached to the ALB's ENIs (one per AZ). Deleting the ALB also frees these. 
+ +### VPCs and Networking + +| VPC ID | CIDR | Name | Internet Gateway | +| --- | --- | --- | --- | +| `` | 172.30.0.0/16 | EMS VPC (unnamed) | `` | +| `` | 10.0.0.0/16 | tinyurl-prod-vpc | `` | + +TinyURL VPC subnets: + +| Subnet | CIDR | AZ | Role | +| --- | --- | --- | --- | +| tinyurl-public-1a | 10.0.1.0/24 | us-east-1a | EC2 lives here | +| tinyurl-public-1b | 10.0.2.0/24 | us-east-1b | ALB second node | +| tinyurl-private-1a | 10.0.3.0/24 | us-east-1a | RDS lives here | +| tinyurl-private-1b | 10.0.4.0/24 | us-east-1b | RDS subnet group (unused) | + +No NAT Gateways exist anywhere. EC2 reaches the internet directly via the internet gateway. + +### Route 53 + +One hosted zone: `buffden.com` + +| Record | Type | Value | +| --- | --- | --- | +| ems.buffden.com | A | EIP (assigned 2026-04-02, permanent) | +| go.buffden.com | A (alias) | ALB DNS (auto-generated) | +| tinyurl.buffden.com | A | CloudFront domain (SPA) | +| portfolio.buffden.com | CNAME | buffden.github.io | + +### S3 Buckets (negligible cost) + +| Bucket | Created | Purpose | +| --- | --- | --- | +| emsbuffdens3 | Dec 2025 | EMS assets | +| ems-logs-export-20260310 | Mar 2026 | Log export (one-time) | +| tinyurl-spa-prod | Mar 2026 | TinyURL React SPA | + +### CloudWatch Log Groups + +| Log Group | Stored | Retention | +| --- | --- | --- | +| /tinyurl/prod | ~26 MB | 30 days | +| RDSOSMetrics | 0 bytes | 30 days | + +Both have retention set. No runaway log growth. + +### Other Services + +| Service | Status | +| --- | --- | +| ECS | None | +| Lambda | None | +| NAT Gateway | None | +| Secrets Manager | None | +| SSM Parameter Store | 37 params, all Standard tier (free) | + +SSM params: `/ems/prod/*` (28 params) and `/tinyurl/prod/*` + `/tinyurl/cicd/*` (9 params). +Note: `/ems/prod/RDS_ENDPOINT` is stale (left over from before the Docker migration). No cost impact. 
+ +--- + +## April 2026 Bill — Before and After + +| Service | Resource | Original | After Actions | Saved | +| --- | --- | --- | --- | --- | +| EC2 | ems-prod-app (t2.micro) | $8.35 | $8.35 | — | +| EC2 | tinyurl-prod (t3.small → t3.micro) | $14.98 | $7.49 | **$7.49** | +| RDS | tinyurl-prod db.t4g.micro + 20 GB | $13.70 | $13.70 | — | +| ALB | tinyurl-alb | $14.01 | $14.01 | — | +| Public IPv4 | ALB EIP (public-1a) | $3.65 | $3.65 | — | +| Public IPv4 | ALB EIP (public-1b) | $3.65 | $3.65 | — | +| Public IPv4 | Idle EIP (released) | $3.65 | $0 | **$3.65** | +| Public IPv4 | EMS auto-IP | $3.65 | $3.65 | — | +| Public IPv4 | TinyURL auto-IP | $3.65 | $3.65 | — | +| EBS | 8 GB gp3 + 20 GB gp2 | $1.50 | $1.50 | — | +| Route 53 | 1 hosted zone | $0.54 | $0.54 | — | +| Tax | ~10% | ~$7.50 | ~$6.50 | — | +| **Total** | | **~$81** | **~$70** | **$11.14** | + +--- + +## Actions Taken + +### Action 1 — Release idle EIP ✅ DONE 2026-04-02 + +**Saved: $3.65/month** + +Idle EIP released. +Pre-release checks confirmed: not referenced in Route 53, SSM parameters, or any ENI. + +### Action 2 — Downsize TinyURL EC2 ✅ DONE 2026-04-02 + +**Saved: $7.49/month** (t3.small $14.98 → t3.micro $7.49) + +- Stopped both EC2 instances, changed TinyURL to t3.micro, restarted. +- EMS downgrade to t3.micro **failed** — `us-east-1e` only supports t2 family (see AZ lock-in note above). + EMS reverted to t2.micro and restarted successfully. +- EMS received new auto-IP on restart. Route 53 updated immediately. +- ALB target health confirmed healthy after TinyURL restart. + +--- + +### Action 3 — Night Scheduler (Lambda + EventBridge + DLQ + CloudWatch) ✅ DONE 2026-04-03 + +**Estimated savings: ~$18/month** (weeknights + weekends off) + +Implemented the production-grade Option C2 scheduler described in the Option C section below. 
+ +#### What was built + +| Component | Purpose | +| --- | --- | +| Lambda function (Python/boto3) | Stops and starts both EC2 instances + TinyURL RDS | +| EventBridge Scheduler — nightly stop | Fires every day at 11 PM CDT (04:00 UTC) | +| EventBridge Scheduler — weekday start | Fires Mon–Fri at 7 AM CDT (12:00 UTC) | +| SQS Dead Letter Queue | Captures failed Lambda invocations for inspection | +| CloudWatch Log Group (30-day retention) | Full execution history for every stop/start run | +| CloudWatch Alarm | Monitors Lambda errors; publishes to SNS on failure | +| SNS Topic | Alert channel for scheduler failures | +| IAM roles | Least-privilege: Lambda can stop/start only the specific instances | + +#### Schedule + +| Event | When (CDT) | When (UTC) | Cron expression | +| --- | --- | --- | --- | +| Stop all | Every day 11 PM | 04:00 UTC next day | `cron(0 4 * * ? *)` | +| Start all | Mon–Fri 7 AM | 12:00 UTC | `cron(0 12 ? * MON-FRI *)` | + +Weekends: instances stop Friday 11 PM and do not restart until Monday 7 AM (56h off). + +#### Result + +- **Effective uptime:** Mon–Fri 7 AM–11 PM CDT (80h/week out of 168h = **48% uptime**) +- **Instances affected:** both EC2s + TinyURL RDS +- **Tested successfully:** Lambda invoked manually, both EC2s transitioned to stopped state, + RDS stop initiated, execution logged to CloudWatch Logs (~5 seconds per invocation) + +#### Theory + +The scheduler uses a four-layer observability and reliability pattern standard in production AWS shops: + +**EventBridge Scheduler** (launched 2022) handles the cron timing with timezone awareness. +It calls Lambda directly on the configured schedule. Unlike the older EventBridge Rules service, +Scheduler has a persistent schedule store, supports `at()` one-time expressions, and can invoke +AWS SDK APIs directly without Lambda if needed. + +**Lambda** runs the stop/start logic with no idle cost. 
Python boto3 calls `ec2.stop_instances`, +`ec2.start_instances`, `rds.stop_db_instance`, and `rds.start_db_instance`. Total runtime per +invocation is under 10 seconds — well within Lambda's 15-minute limit. At ~120 invocations/month +the cost is $0 (covered by the 1M request free tier). + +**Dead Letter Queue (DLQ)** — an SQS queue attached to the Lambda function. If the function +throws an unhandled exception (e.g. API throttle, IAM permission error), AWS routes the failed +event payload to the DLQ instead of silently dropping it. Messages stay in the queue for 4 days +and can be replayed once the root cause is fixed. + +**CloudWatch Logs** — every Lambda invocation automatically writes structured logs to a dedicated +log group. You can see exactly which instances were stopped, timestamps, and any errors without +SSH access. Retention set to 30 days to stay within the 5 GB CloudWatch free tier. + +**CloudWatch Alarm + SNS** — the alarm watches the Lambda error metric. If errors > 0 in a +5-minute window, the alarm transitions to ALARM state and publishes to an SNS topic. The topic +sends an email alert so that a missed stop (and the resulting billing) does not go unnoticed. + +#### Cost + +All five services (Lambda, EventBridge, SQS, CloudWatch, SNS) stay within free tier at +~120 invocations/month. Net additional cost: **$0/month**. Net saving: **~$18/month**. + +--- + +## Remaining Reduction Opportunities + +### Option A — Remove TinyURL ALB + release its 2 EIPs + +**Would save: $14.01 (ALB) + $7.30 (2 EIPs) - $3.65 (new EC2 EIP needed) = $17.66/month** + +The ALB is currently the only entry point to the TinyURL EC2 — the EC2 security group only allows inbound from the ALB security group. To remove it safely: + +1. 
Add HTTP/HTTPS rules directly to the EC2 security group: + +```bash +aws ec2 authorize-security-group-ingress \ + --group-id \ + --protocol tcp --port 443 --cidr 0.0.0.0/0 + +aws ec2 authorize-security-group-ingress \ + --group-id \ + --protocol tcp --port 80 --cidr 0.0.0.0/0 +``` + +2. Assign a static EIP to the TinyURL EC2 (so its IP does not change on stop/start). + +3. Set up HTTPS on the EC2 directly using Certbot and Nginx (same pattern as EMS). + +4. Update Route 53 `go.buffden.com` to point to the EC2 EIP. + +5. Update SSM params `/tinyurl/prod/tinyurl.frontend-url` and `/tinyurl/prod/tinyurl/base-url` + if they referenced the ALB DNS name. + +6. Check the CloudFront distribution for `tinyurl.buffden.com` — update its origin if it pointed + at the ALB. + +7. Delete the ALB and release its 2 EIPs: + +```bash +aws elbv2 delete-load-balancer \ + --load-balancer-arn + +aws ec2 release-address --allocation-id +aws ec2 release-address --allocation-id +``` + +**Risk: MEDIUM.** No downtime during setup — switch DNS last. The main risk is HTTPS config and +any SSM params or CloudFront origin that reference the ALB hostname. + +--- + +### Option B — Migrate TinyURL DB from RDS to Docker Postgres on EC2 + +**Would save: $13.70/month** + +We have a proven playbook from the EMS RDS-to-Docker migration (`docs/rds-to-docker-postgres-migration.md`). + +1. Dump from RDS: + +```bash +pg_dump -h -U -d -F c -f tinyurl_backup.dump +``` + +2. SSH into `tinyurl-prod` EC2, start Docker Postgres (same compose pattern as EMS). + +3. Restore the dump. + +4. Update SSM params: + - `/tinyurl/prod/spring/datasource/url` → new Docker Postgres connection string + - `/tinyurl/cicd/rds-endpoint` → mark deprecated + +5. Restart the TinyURL Spring Boot app and verify. + +6. 
Delete the RDS instance: + +```bash +aws rds delete-db-instance \ + --db-instance-identifier tinyurl-prod \ + --skip-final-snapshot +``` + +**Risk: MEDIUM.** EMS has been running Spring Boot + Docker Postgres + Redis + Nginx on 1 GB RAM +since the March 2026 RDS-to-Docker migration without issues. TinyURL EC2 is now t3.micro (1 GB), same ceiling. Watch memory +after migration before declaring it stable. + +--- + +### Option C — Night Scheduler (stop EC2 + RDS during off-hours) ✅ DONE 2026-04-03 + +See Action 3 above for implementation details. + +**Note:** The ALB charges 24/7 regardless of whether the EC2 is stopped. Option C only saves +on EC2 compute and RDS instance hours. + +--- + +#### Recommended Schedule + +Both projects are personal/portfolio — no users outside of demos. Suggested windows: + +| Window | Stop | Start | Hours off/day | Savings | +| --- | --- | --- | --- | --- | +| **Weeknights only** | 11 PM CDT | 7 AM CDT | 8h | ~$9/month | +| **Weeknights + weekends** | Fri 11 PM CDT | Mon 7 AM CDT | 8h weekdays + 48h weekend | ~$18/month | + +**Weeknights + weekends** is the recommended option for portfolio projects: +- Uptime is **Mon–Fri 7 AM–11 PM CDT** (80h/week out of 168h = 48% uptime) +- Saves ~52% on EC2 and RDS compute hours +- Recruiters and interviewers are never visiting at 2 AM on a Sunday +- If you need it running outside the window, start it manually in 30 seconds via AWS console or CLI + +--- + +#### Scheduler Implementation Options + +Four approaches to implement this — compared by complexity, AWS services used, and resume value: + +| # | Approach | AWS Services | Resume Value | Complexity | +| --- | --- | --- | --- | --- | +| 1 | **EventBridge Scheduler + direct targets** | EventBridge Scheduler, IAM | ⭐⭐⭐⭐ Modern | Low | +| 2 | **Lambda + EventBridge Rules** | Lambda, EventBridge, IAM | ⭐⭐⭐⭐⭐ Classic | Medium | +| 3 | **GitHub Actions scheduled workflows** | GitHub Actions, OIDC | ⭐⭐⭐ DevOps | Low | +| 4 | **SSM Automation Runbooks** | SSM, EventBridge, IAM | 
⭐⭐⭐ Enterprise | High | + +--- + +**Option C1 — EventBridge Scheduler with direct targets (already scripted)** + +The `scripts/aws-night-scheduler.sh` script implements this. EventBridge Scheduler calls the +EC2 and RDS APIs directly — no Lambda or intermediate compute needed. + +``` +EventBridge Scheduler → ec2:StopInstances / ec2:StartInstances + → rds:StopDBInstance / rds:StartDBInstance +``` + +- Introduced in 2022 — shows awareness of current AWS services +- No Lambda cold starts, no function code to maintain +- IAM role scoped to specific instance ARNs (least-privilege) +- Resume talking point: *"Used EventBridge Scheduler with direct SDK targets to automate + EC2/RDS stop-start cycles, eliminating Lambda overhead and reducing compute costs by X%"* + +--- + +**Option C2 — Lambda + EventBridge Rules (most resume-visible combination)** + +The classic approach — widely referenced in job descriptions and AWS certifications. +A Lambda function (Python/boto3) is triggered by a CloudWatch/EventBridge cron rule. 
+ +``` +EventBridge Rule (cron) → Lambda (boto3) → EC2 / RDS API +``` + +```python +import boto3 + +def handler(event, context): + ec2 = boto3.client('ec2', region_name='us-east-1') + rds = boto3.client('rds', region_name='us-east-1') + action = event['action'] # 'stop' or 'start' + + if action == 'stop': + ec2.stop_instances(InstanceIds=['', '']) + rds.stop_db_instance(DBInstanceIdentifier='tinyurl-prod') + elif action == 'start': + ec2.start_instances(InstanceIds=['', '']) + rds.start_db_instance(DBInstanceIdentifier='tinyurl-prod') +``` + +- Every AWS Solutions Architect role knows this pattern +- Lambda, EventBridge, IAM, boto3 — all highly searchable on LinkedIn +- Easy to extend: add SNS notification on failure, CloudWatch alarm on missed start +- Resume talking point: *"Built serverless automation with Lambda and EventBridge to schedule + infrastructure stop/start, saving ~$X/month with zero maintenance overhead"* + +--- + +**Option C3 — GitHub Actions scheduled workflows** + +Use a cron-scheduled GitHub Actions workflow with AWS CLI commands. No AWS-side resources needed +beyond an OIDC role. + +```yaml +on: + schedule: + - cron: '0 4 * * *' # 11 PM CDT = 04:00 UTC (stop) + - cron: '0 12 * * 1-5' # 7 AM CDT = 12:00 UTC Mon-Fri (start) +``` + +- Simplest to implement if you already have GitHub Actions CI/CD (EMS does) +- Shows OIDC, GitHub Actions, and AWS CLI integration +- Downside: depends on GitHub Actions availability; if GitHub has an incident, instances stay on +- Resume talking point: *"Extended CI/CD pipeline with scheduled GitHub Actions workflows + for automated infrastructure lifecycle management"* + +--- + +**Option C4 — SSM Automation Runbooks** + +Enterprise-grade approach. SSM Automation documents (runbooks) define multi-step stop/start +procedures that can be triggered by EventBridge. Supports approval steps, notifications, and +rollback logic. 
+ +- Overkill for this scale but highly valued in enterprise/ops roles +- Resume talking point: *"Authored SSM Automation runbooks for scheduled EC2 lifecycle + management with built-in health verification steps"* + +--- + +#### Which to Use + +For this project the script in `scripts/aws-night-scheduler.sh` (Option C1) is already ready to +run. If you want maximum resume visibility, implement Option C2 (Lambda) as well — use it for +the weekend schedule and keep EventBridge Scheduler for the nightly window. That way your +infrastructure demonstrates both patterns. + +--- + +#### Production Standard — Lambda + EventBridge + SNS + DLQ + +The production-grade implementation combines all four services into an observable, failure-aware +pipeline. This is what mature AWS shops use for any scheduled infrastructure automation. + +##### Why this is the standard + +| Concern | How it is addressed | +| --- | --- | +| What if the Lambda fails silently? | Dead Letter Queue (DLQ) captures every failed invocation | +| How do I know if instances didn't stop? | SNS sends an alert email/Slack on Lambda failure | +| How do I debug what happened? | CloudWatch Logs stores the full execution output of every run | +| What if someone changes the schedule wrongly? | Lambda code is version-controlled in git, not a console click | +| Is the IAM permission scope safe? | IAM role scoped to specific instance ARNs — least privilege | +| Can I trigger it manually without touching cron? | AWS console → Lambda → Test, or `aws lambda invoke` from CLI | + +##### Architecture + +``` +EventBridge Scheduler (cron expression) + │ + ▼ + Lambda Function (Python/boto3) + ┌─────────────────────────────┐ + │ 1. Log invocation start │ + │ 2. Call EC2/RDS API │ + │ 3. Verify state changed │ + │ 4. 
Log result │ + └─────────────────────────────┘ + │ │ + Success Failure + │ │ + CloudWatch DLQ (SQS) + Logs + SNS → alert +``` + +##### Key concepts + +**EventBridge Scheduler** — AWS's dedicated scheduling service (launched 2022). Supports cron +and rate expressions, timezone-aware schedules, and can invoke Lambda, Step Functions, or AWS +SDK APIs directly. Unlike the older EventBridge Rules, it has a persistent schedule store and +supports one-time `at()` expressions for manual one-off triggers. + +**Lambda** — Serverless function that runs your stop/start logic. No servers to manage, no +idle cost. At 120 invocations/month the cost is $0 (well within the 1M request free tier). +The function receives the schedule event, calls the AWS SDK (boto3), logs the result, and +exits. Total runtime per invocation: ~3–5 seconds. + +**Dead Letter Queue (DLQ)** — An SQS queue attached to the Lambda function. If Lambda +throws an unhandled exception (e.g. API throttle, permission error), AWS automatically sends +the failed event to the DLQ instead of dropping it. You can inspect the DLQ to see exactly +what went wrong and replay the message once the issue is fixed. + +**SNS (Simple Notification Service)** — A CloudWatch Alarm monitors the Lambda's error +metric. If errors > 0, the alarm fires and publishes to an SNS topic which sends an email +(or Slack webhook). This means you get notified if your instances fail to stop — before you +see a surprise bill. + +**CloudWatch Logs** — Every Lambda invocation writes structured logs to a dedicated log +group. You can query execution history, see which instances were stopped, and diagnose +failures without SSHing into anything. 
+ +##### Cost at this scale + +| Service | Free tier | Monthly usage | Cost | +| --- | --- | --- | --- | +| EventBridge Scheduler | 14M invocations free | ~120 | $0 | +| Lambda | 1M requests + 400K GB-seconds free | ~120 requests, ~75 GB-seconds | $0 | +| CloudWatch Logs | 5 GB ingestion free | ~120 KB | $0 | +| SNS | 1M publishes + 1K emails free | ~0 (only on failure) | $0 | +| SQS (DLQ) | 1M requests free | ~0 (only on failure) | $0 | +| **Total** | | | **$0/month** | + +##### Manual trigger options + +```bash +# One-time stop right now (without waiting for the cron schedule) +aws lambda invoke \ + --function-name ems-night-scheduler \ + --payload '{"action":"stop"}' \ + --cli-binary-format raw-in-base64-out \ + response.json + +# One-time start +aws lambda invoke \ + --function-name ems-night-scheduler \ + --payload '{"action":"start"}' \ + --cli-binary-format raw-in-base64-out \ + response.json +``` + +Or from the AWS console: Lambda → select function → Test tab → create a test event with +`{"action": "stop"}` → click Test. Works from any browser including mobile. 
+ +##### Resume talking points + +- *"Designed serverless infrastructure scheduler using Lambda, EventBridge, SNS, SQS DLQ, + and CloudWatch — reducing monthly EC2/RDS costs by ~52% with full observability and + zero operational overhead"* +- *"Implemented least-privilege IAM roles scoped to specific resource ARNs for scheduled + automation functions"* +- *"Built failure alerting pipeline using CloudWatch Alarms → SNS → email for unattended + infrastructure jobs"* + +--- + +## Projected Bill for Each Scenario + +| Scenario | Actions | Monthly Bill | vs Original | +| --- | --- | --- | --- | +| Baseline (before audit) | none | ~$81 | — | +| After Action 1 + 2 | idle EIP + EC2 downsize | ~$70 | -13% | +| **Current state** | + Night scheduler (Action 3) | **~$52** | **-36%** | +| + Night scheduler | current + Option C | ~$52 | -36% | +| + DB migration | current + Option B | ~$56 | -31% | +| + DB migration + scheduler | current + B + C | ~$47 | -42% | +| + ALB removal | current + Option A | ~$52 | -36% | +| + ALB + DB migration | current + A + B | ~$38 | -53% | +| + ALB + DB + scheduler | current + A + B + C | ~$29 | **-64%** | + +Crossing 50% reduction requires removing either the ALB or the RDS — keeping both caps savings at ~42%. + +--- + +## EC2 Instance Downgrade Reference + +### What Is Running on Each Instance + +#### ems-prod-app (t2.micro — 1 vCPU, 1 GB RAM, us-east-1e) + +| Service | Container | Est. RAM | +| --- | --- | --- | +| Spring Boot (EMS app) | app | ~400–500 MB | +| PostgreSQL 17 | postgres | ~100–150 MB | +| Redis | redis | ~30–50 MB | +| Nginx | nginx | ~5–10 MB | +| OS + Docker daemon | — | ~100–130 MB | +| **Total** | | **~635–840 MB / 1024 MB** | + +Tight but stable since December 2025. Approx 62–82% memory at idle. + +#### tinyurl-prod (t3.micro — 2 vCPU, 1 GB RAM, us-east-1a) + +| Service | Est. 
RAM | +| --- | --- | +| Spring Boot (TinyURL app) | ~350–450 MB | +| Nginx | ~5–10 MB | +| OS + Docker daemon | ~100–130 MB | +| **Total** | **~455–590 MB / 1024 MB** | + +Comfortable at ~45–57% memory. If Docker Postgres is added later (Option B), this becomes +the same pattern as EMS — proven to work but with limited headroom. + +### CPU Utilization — Last 5 Days (Mar 28 – Apr 1) + +#### ems-prod-app + +| Date | Avg CPU | Peak CPU | +| --- | --- | --- | +| Mar 28 | 2.87% | 4.67% | +| Mar 29 | 2.92% | 4.92% | +| Mar 30 | 2.92% | 5.67% | +| Mar 31 | 2.93% | 5.00% | +| Apr 1 | 11.34% | 100% | + +The 100% spike on Apr 1 is a deployment restart (JVM warm-up + Flyway migrations). Normal +operating average is a stable ~3%. + +#### tinyurl-prod + +| Date | Avg CPU | Peak CPU | +| --- | --- | --- | +| Mar 28 | 0.60% | 19.4% | +| Mar 29 | 0.73% | 29.2% | +| Mar 30 | 0.54% | 7.8% | +| Mar 31 | 0.53% | 11.3% | +| Apr 1 | 0.54% | 16.1% | + +Average below 1% every day. Peaks are CI/CD deployments. Highly underutilised. + +### Available Instance Types by AZ + +**us-east-1e (where EMS lives) — t2 family only:** + +| Type | vCPU | RAM | Cost/month | Notes | +| --- | --- | --- | --- | --- | +| t2.micro *(current)* | 1 | 1 GB | $8.35 | Cheapest viable option in this AZ | +| t2.small | 1 | 2 GB | $16.70 | More expensive, no benefit | +| t2.nano | 1 | 0.5 GB | $4.18 | Too little RAM for 4 services | + +EMS is already on the cheapest viable instance type for its AZ. +To use t3/t3a/t4g, the instance would need to be re-launched in a different AZ — not worth +it for a saving of $0.86/month. 
+ +**us-east-1a (where TinyURL lives) — full t-family support:** + +| Type | vCPU | RAM | Cost/month | Saves vs t3.micro | Architecture | +| --- | --- | --- | --- | --- | --- | +| t3.micro *(current)* | 2 | 1 GB | $7.49 | — | x86 | +| t3a.micro | 2 | 1 GB | $6.77 | $0.72 | x86 AMD | +| t4g.micro | 2 | 1 GB | $6.05 | $1.44 | ARM (needs ARM Docker images) | +| t3.nano | 2 | 0.5 GB | $3.74 | — | Too little RAM | + +t3.micro is the recommended stopping point for TinyURL unless ARM Docker images are built. +The additional saving from t3a.micro ($0.72/month) or t4g.micro ($1.44/month) is marginal. + +### Will It Crash? Stability Verdict + +| Change | Memory headroom | CPU headroom | Verdict | +| --- | --- | --- | --- | +| TinyURL t3.small → t3.micro (done) | ~410–545 MB free | 99%+ free | Safe | +| EMS t2.micro → t3.micro | Not possible in us-east-1e | — | Blocked by AZ | +| TinyURL add Docker Postgres (Option B) | ~185–390 MB free (tight) | 99%+ free | Same as EMS today — proven | +| TinyURL t3.micro → t4g.micro | Same RAM, ARM only | Same | Safe if ARM images built | + +--- + +## What Not to Touch + +| Resource | Reason | +| --- | --- | +| EMS t2.micro | Cheapest available in us-east-1e | +| EBS volumes | Both in use, correct sizes | +| Route 53 hosted zone | $0.54/month, required | +| S3 buckets | Negligible cost, active use | +| CloudWatch log groups | Retention already set, no cost issue | +| SSM Parameter Store | All Standard tier (free) | diff --git a/docs/insights/aws-db-analysis.md b/docs/insights/aws-db-analysis.md new file mode 100644 index 0000000..c8b00f3 --- /dev/null +++ b/docs/insights/aws-db-analysis.md @@ -0,0 +1,299 @@ +# RDS Database Size Analysis — tinyurl-prod + +**Created:** 2026-04-10 +**Instance:** `tinyurl-prod` — db.t4g.micro — PostgreSQL 17.4 — us-east-1a +**Status:** Active (running Mon–Fri 7 AM – 11 PM CDT, stopped overnight/weekends) + +--- + +## TL;DR + +| | Value | +| --- | --- | +| Allocated storage | 20 GiB gp3 | +| Auto-scale threshold 
| 25 GiB | +| Alert status | WARNING — emitting "approaching threshold" every 2 hours since Apr 3 | +| Backup window | 09:14–09:44 UTC ← **conflicts with nightly stop at 04:00 UTC** | +| Backup retention | 7 days | +| Automated snapshots | 8 (all 20 GB each, incremental) | +| Manual snapshots | **0** | +| Estimated monthly storage cost | ~$2.30/month (20 GB × $0.115/GB) | + +**Immediate actions needed:** +1. Fix the backup window — it fires during the stopped period +2. Investigate what is consuming the 20 GB before auto-scale triggers +3. Take a manual snapshot before any RDS deletion (Option B) + +--- + +## Storage Alert Details + +RDS has been emitting the following event every 2 hours since at least 2026-04-03: + +``` +Storage size 20 GiB is approaching the maximum storage threshold 25 GiB. +Increase the maximum storage threshold. +``` + +This is a pre-scale warning. AWS will automatically expand the volume when storage usage +approaches the allocated size. Once expanded to 25 GiB, the allocation stays at 25 GiB +even if data is deleted — RDS does not shrink storage automatically. + +**Cost impact of auto-scale triggering:** + +| | Before | After auto-scale | +| --- | --- | --- | +| Allocated storage | 20 GiB | 25 GiB | +| Storage cost | $2.30/month | $2.88/month | +| Monthly delta | — | +$0.58/month | + +Minor cost impact, but the real concern is that **you are running close to capacity on a +personal project**. If the DB keeps growing it will need another scale event. + +--- + +## Backup Window Conflict with Night Scheduler + +**This is the primary operational issue to fix.** + +### Current configuration + +| Setting | Value | Problem? 
| +| --- | --- | --- | +| Backup window | `09:14–09:44 UTC` | Yes — falls in stopped period | +| Scheduler stop | `04:00 UTC` daily | RDS stopped before backup window | +| Scheduler start | `12:00 UTC` Mon–Fri | RDS restarts after backup window | +| Weekend state | Stopped Sat 04:00 UTC → Mon 12:00 UTC | Backup pending ~56 hours | + +The backup window (09:14 UTC) falls between the stop (04:00 UTC) and start (12:00 UTC). +Every day, the backup is scheduled during the stopped period. + +### What AWS actually does + +**On weekdays:** AWS defers the backup until the instance is running. Backups shift to the +next available window during the active period. Confirmed: after the scheduler was deployed, +weekday backups now happen at ~21:48 UTC (4:48 PM CDT) instead of 09:19 UTC. + +**On weekends:** The instance stays stopped from Sat 04:00 UTC (Fri 11 PM CDT) to Mon 12:00 UTC (~56 hours). +AWS will auto-restart the RDS to take the backup rather than miss the backup for more than +~12–18 hours. + +### Observed weekend auto-restart (2026-04-04, Saturday) + +``` +2026-04-04T04:08:39 UTC — DB instance stopped (by night scheduler) +2026-04-04T09:14 UTC — backup window fires (deferred — instance stopped) +... +2026-04-04T21:43:29 UTC — "Recovery of the DB instance has started" (AWS auto-restart) +2026-04-04T21:43:59 UTC — "Recovery of the DB instance has started" (retry) +2026-04-04T21:44:29 UTC — "Recovery of the DB instance has started" (retry) +2026-04-04T21:46:12 UTC — DB instance restarted +2026-04-04T21:47:16 UTC — DB instance started +2026-04-04T21:48:20 UTC — Backing up DB instance +2026-04-04T21:50:30 UTC — Finished DB Instance backup +2026-04-04T22:19:05 UTC — DB instance stopped (by second stop attempt or manual) +``` + +AWS waited ~12.5 hours after the missed backup window (~17.5 hours after the stop) before +auto-restarting. The backup took +~2 minutes, then the instance was stopped again. This adds ~36 minutes of RDS runtime on +weekends with no value. 
+ +### Fix + +Move the backup window to the middle of the active running period so it never conflicts: + +```bash +aws rds modify-db-instance \ + --db-instance-identifier tinyurl-prod \ + --preferred-backup-window "18:00-18:30" \ + --apply-immediately +``` + +`18:00–18:30 UTC` = 1:00–1:30 PM CDT. This is the midpoint of the Mon–Fri running window +(12:00–04:00 UTC). Backups will reliably run at 1 PM CDT on weekdays. + +On **weekends**, the instance is stopped the whole time, so AWS will still auto-restart it +once to take a backup. This is unavoidable without disabling automated backups entirely. +The auto-restart will happen in the afternoon CDT (matching the backup window schedule) and +only lasts ~5 minutes. + +--- + +## Automated Snapshot Inventory + +8 automated snapshots exist as of 2026-04-10. All are 20 GB each (nominal allocation; +actual incremental disk usage is much smaller after the first snapshot). + +| Snapshot ID | Created (UTC) | Size | Type | +| --- | --- | --- | --- | +| `rds:tinyurl-prod-2026-03-29-09-19` | 2026-03-29 09:19 | 20 GB | automated | +| `rds:tinyurl-prod-2026-03-30-09-19` | 2026-03-30 09:19 | 20 GB | automated | +| `rds:tinyurl-prod-2026-03-31-09-19` | 2026-03-31 09:19 | 20 GB | automated | +| `rds:tinyurl-prod-2026-04-01-09-19` | 2026-04-01 09:19 | 20 GB | automated | +| `rds:tinyurl-prod-2026-04-02-09-19` | 2026-04-02 09:19 | 20 GB | automated | +| `rds:tinyurl-prod-2026-04-04-21-48` | 2026-04-04 21:48 | 20 GB | automated | +| `rds:tinyurl-prod-2026-04-06-21-48` | 2026-04-06 21:48 | 20 GB | automated | +| `rds:tinyurl-prod-2026-04-08-21-48` | 2026-04-08 21:48 | 20 GB | automated | + +**Note on the Apr 4 and later backups:** After the scheduler was deployed (Apr 3), the +backup window conflict caused the backup to be deferred and taken at 21:48 UTC instead of +09:19 UTC. The backup time shift from morning to evening is visible in the snapshot IDs. 
+ +### Snapshot storage cost + +AWS includes automated backup storage **equal to 100% of your DB's allocated storage** for +free. For a 20 GB DB, the first 20 GB of snapshot storage is free. + +Automated snapshots are block-level incremental after the first full snapshot, so 8 snapshots +of a 20 GB DB does not cost 160 GB — it costs 20 GB (initial) + incremental changes only. +For a low-write portfolio DB, the incremental changes are likely under 1–2 GB total. + +**Current estimated snapshot storage cost: ~$0/month** (within free tier). + +### No manual snapshots exist + +Zero manual snapshots. This is a risk: if the RDS instance is deleted (Option B), automated +snapshots are deleted with it unless a final snapshot is explicitly requested. + +--- + +## DB Size Investigation + +To understand what is consuming the 20 GB, connect via SSM + psql and run: + +```bash +# Start SSM session to the tinyurl EC2 instance +aws ssm start-session --target i-0bfdc622bac423b96 + +# Inside the session — connect to RDS (get endpoint from SSM param) +RDS_HOST=$(aws ssm get-parameter --name /tinyurl/prod/spring/datasource/url --query Parameter.Value --output text | grep -oP '(?<=//)[^:/]+') +psql -h $RDS_HOST -U -d +``` + +Once connected, run these queries: + +```sql +-- Total database size +SELECT pg_size_pretty(pg_database_size(current_database())) AS db_size; + +-- Size by table (top 10) +SELECT + schemaname, + tablename, + pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size, + pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size, + pg_size_pretty(pg_indexes_size(schemaname||'.'||tablename)) AS index_size +FROM pg_tables +WHERE schemaname NOT IN ('pg_catalog', 'information_schema') +ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC +LIMIT 10; + +-- Bloat check — tables with dead tuples +SELECT + relname AS table, + n_live_tup AS live_rows, + n_dead_tup AS dead_rows, + ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + 
n_dead_tup, 0), 1) AS dead_pct, + last_autovacuum, + last_autoanalyze +FROM pg_stat_user_tables +ORDER BY n_dead_tup DESC +LIMIT 10; + +-- WAL and transaction log usage +SELECT + pg_size_pretty(sum(size)) AS wal_size +FROM pg_ls_waldir(); + +-- Check for large TOAST values (overflow storage for large text/bytea) +SELECT + c.relname AS table, + t.relname AS toast_table, + pg_size_pretty(pg_total_relation_size(t.oid)) AS toast_size +FROM pg_class c +JOIN pg_class t ON c.reltoastrelid = t.oid +WHERE pg_total_relation_size(t.oid) > 1024 * 1024 +ORDER BY pg_total_relation_size(t.oid) DESC; +``` + +### Common causes in a Spring Boot + Flyway DB + +| Cause | How to check | How to fix | +| --- | --- | --- | +| URL table has millions of rows | `SELECT count(*) FROM urls` | Archive old/unused rows | +| Dead tuples (table bloat) | Dead rows query above | `VACUUM FULL ` | +| Index bloat | `pg_indexes_size` in size query | `REINDEX TABLE
` | +| Flyway migration history | `SELECT count(*) FROM flyway_schema_history` | Delete old entries if not needed | +| WAL accumulation | `pg_ls_waldir()` query | Usually self-cleaning; check if replication slots stuck | +| pg_wal directory | N/A if no replicas | No replication slots = no WAL retention issue | + +--- + +## Recommended Action Plan + +### Immediate (this week) + +1. **Fix backup window** — prevents weekend auto-restarts + ```bash + aws rds modify-db-instance \ + --db-instance-identifier tinyurl-prod \ + --preferred-backup-window "18:00-18:30" \ + --apply-immediately + ``` + +2. **Reduce backup retention to 1 day** — no need for 7-day history on a personal project + ```bash + aws rds modify-db-instance \ + --db-instance-identifier tinyurl-prod \ + --backup-retention-period 1 \ + --apply-immediately + ``` + +3. **Investigate DB size** — use SSM + psql queries above to find what is using the 20 GB + +### Before option B (RDS deletion) + +4. **Take a final manual snapshot** + ```bash + aws rds create-db-snapshot \ + --db-instance-identifier tinyurl-prod \ + --db-snapshot-identifier tinyurl-prod-final-before-docker-migration + + aws rds wait db-snapshot-completed \ + --db-snapshot-identifier tinyurl-prod-final-before-docker-migration + ``` + +5. **Dump the DB to a file as a second safety net** + ```bash + pg_dump -h -U -d \ + -F c -f ~/tinyurl_prod_$(date +%Y%m%d).dump + ``` + +6. **Delete instance with skip-final-snapshot** (since you already have both above) + ```bash + aws rds delete-db-instance \ + --db-instance-identifier tinyurl-prod \ + --skip-final-snapshot + ``` + +7. 
**After Docker Postgres is stable for 2 weeks**, delete the manual snapshot: + ```bash + aws rds delete-db-snapshot \ + --db-snapshot-identifier tinyurl-prod-final-before-docker-migration + ``` + +--- + +## Storage Cost Reference + +| Item | Rate | Current cost | +| --- | --- | --- | +| gp3 storage | $0.115/GiB/month | $2.30/month (20 GB) | +| gp3 IOPS above 3000 baseline | $0.02/IOPS/month | $0 (using baseline) | +| gp3 throughput above 125 MB/s | $0.04/MiB/month | $0 (using baseline) | +| Automated snapshot storage | Free up to DB size | $0 (within 20 GB free) | +| Manual snapshot storage | $0.095/GiB/month | $0 (none exist) | +| **Total storage** | | **~$2.30/month** | + +If RDS is deleted (Option B), all of the above goes to $0 and is replaced by the existing +20 GB EBS volume on the EC2 (already paid for at $1.00/month for the 20 GB gp2 volume). diff --git a/docs/insights/funny-findings.md b/docs/insights/funny-findings.md new file mode 100644 index 0000000..fb12e25 --- /dev/null +++ b/docs/insights/funny-findings.md @@ -0,0 +1,74 @@ +# TinyURL — Funny Findings Log + +A running log of suspicious, hilarious, or otherwise interesting things discovered in production. + +--- + +## #1 — The Mystery URL from Patna + +**Date discovered:** 2026-04-10 +**DB Row ID:** 1033 +**Short code:** `0000Gf` (becomes `Gf` after the leading-zero fix) + +### The URL + +``` +https://harshlatrinemehai.com +``` + +A 10-year short URL. For a domain whose name translates roughly to something unprintable in polite company. Set to expire **April 5, 2036**. 
+ +### Full DB Record + +| Field | Value | +|---|---| +| `id` | 1033 | +| `short_code` | `0000Gf` | +| `original_url` | `https://harshlatrinemehai.com` | +| `created_at` | `2026-04-08 23:51:46 UTC` | +| `expires_at` | `2036-04-05 00:00:00 UTC` | +| `has_explicit_expiry` | `true` | + +### Who Did This + +Traced via CloudWatch access logs (`/tinyurl/prod`): + +``` +49.47.134.46 - - [08/Apr/2026:23:51:46 +0000] +"POST /api/urls HTTP/1.1" 201 167 +"https://tinyurl.buffden.com/" +"Mozilla/5.0 (iPhone; CPU iPhone OS 18_6_2 like Mac OS X) + AppleWebKit/605.1.15 (KHTML, like Gecko) + Mobile/15E148 [LinkedInApp]/9.31.9671" +``` + +| Field | Value | +|---|---| +| **IP** | `49.47.134.46` | +| **City** | Patna, Bihar, India | +| **ISP** | Reliance Jio Infocomm Limited (AS55836) | +| **Device** | iPhone (iOS 18.6.2) | +| **App** | LinkedIn app (opened tinyurl.buffden.com from within LinkedIn) | +| **Time (local)** | 5:21 AM IST, April 9 2026 | +| **Coordinates** | 25.5941°N, 85.1356°E | + +### Timeline + +- **5:21 AM IST** — Someone in Patna, Bihar is awake, on LinkedIn, on their iPhone, on Jio mobile data +- They somehow land on `tinyurl.buffden.com` +- They deliberately type `harshlatrinemehai.com` as the URL to shorten +- They select a **10-year expiry** — this was not an accident +- They hit submit +- `go.buffden.com/0000Gf` now redirects to a toilet-named domain until 2036 + +### Notes + +- Dynamic Jio IP — cannot narrow down to a specific individual +- The domain name appears to be a Hindi insult/joke +- Whoever this is, they planned ahead (10 years) +- The app had no authentication at this time — open to anyone with the link +- Prior to this successful creation, IP `64.189.4.32` (MacBook, Chrome) had attempted ~15 failed POSTs and hit the rate limiter at 16:29 UTC — those were legitimate test attempts by the app owner + +--- + +*Add new findings below as they are discovered.* diff --git a/docs/insights/linkedin-cost-optimization.md 
b/docs/insights/linkedin-cost-optimization.md new file mode 100644 index 0000000..15093b3 --- /dev/null +++ b/docs/insights/linkedin-cost-optimization.md @@ -0,0 +1,29 @@ +My AWS bill was forecasting $81/month across multiple projects. I got it to $52 without removing a single feature. + +EC2, RDS, ALB, CloudFront, GitHub Actions CI/CD, multi-layer security. All of it is running to production standards. I audited the entire infrastructure and brought the bill down 36%. But the more interesting story is where the real savings came from. + +One change was the biggest single contributor. + +A Lambda function triggered by EventBridge shuts down EC2 and RDS every night. Weekends are fully off, back on weekdays at 8 AM. 93 hours of zero EC2/RDS runtime cost per week. Behind it: an SQS DLQ for failed invocations, a CloudWatch Alarm wired to SNS. Because a missed stop that goes unnoticed is just a delayed line item on the next bill, that single change accounted for ~22% of the total bill reduction. + +CloudWatch logs showed the EC2 averaging below 1% CPU. Downsized from t3.small to t3.micro after validating no performance regression. An idle EIP and an orphaned VPC were audited, confirmed unreferenced, and removed. + +The SPA on S3 and CloudFront instead of a VM. Four backend services on a single EC2 via Docker Compose instead of ECS/Fargate. No bastion host, no NAT Gateway, no ECR. Secrets in SSM Parameter Store Standard tier instead of Secrets Manager. CI/CD over OIDC. + +The rate-limiting architecture is a cost decision too, not just a security layer. + +Cloudflare absorbs abuse and bot traffic before it reaches AWS metered infrastructure. Every request stopped at the edge never triggers EC2, RDS, or data transfer charges. A public-facing URL shortener is a real target. Nginx takes it further. Known scanners get a 444, connection closed at the TCP level before a single byte is sent back. 
+ +Two more changes are on the table: removing the ALB and migrating RDS to Dockerized PostgreSQL. Both save real money; both have trade-offs I'm still working through. If both land, the total reduction hits 64%. + +The most expensive decisions aren't the ones that show up on the bill. They're the ones that shaped the architecture before the bill existed. + +Next: the security architecture, Cloudflare over AWS WAF, and the four-layer rate limiting design. + +Production Link: https://tinyurl.buffden.com/ + +Source code: https://github.com/Buffden/tinyurl-api + +Previous post: https://www.linkedin.com/feed/update/urn:li:activity:7447693051248046080/ + +#AWS #CloudCost #SystemDesign #DevOps #SpringBoot #PostgreSQL #Cloudflare \ No newline at end of file diff --git a/docs/insights/linkedin-first-post.md b/docs/insights/linkedin-first-post.md new file mode 100644 index 0000000..1df3f51 --- /dev/null +++ b/docs/insights/linkedin-first-post.md @@ -0,0 +1,36 @@ +TinyURL — Production-Grade URL Shortener +https://lnkd.in/g9fGGqYv + +The goal wasn't to build a URL shortener. It was to treat a small system with the same rigour a production system deserves. + +Built and deployed a URL shortener on Amazon Web Services (AWS) using production-grade technologies, fully documented, with ADRs before any code was written. + +Request flow — six layers before the application code runs + +Cloudflare — The origin is never directly reachable. The EC2 security group accepts inbound traffic only from Cloudflare's published IP ranges. DDoS mitigation, bot protection, and WAF rate limiting are all absorbed at the edge before they become an AWS bill. + +CloudFront — Routes by path, forward to the ALB; everything else serves the Angular SPA from S3. Frontend and backend deploy independently. + +ALB — Terminates TLS, redirects HTTP to HTTPS, and runs health checks. It detects failure immediately and stops forwarding traffic. 
+ +NGINX — Rate limiting across three independent zones with per-IP connection caps. Known vulnerability scanners are silently dropped at the TCP level. + +Spring Boot — Stateless, input validated at the boundary. HTTP semantics chosen deliberately: 301 for permanent links, 302 for expiring ones, 410 Gone for expired — not 404. Each signals a different intent to browsers, crawlers, and downstream clients. + +PostgreSQL — Two DB users by design. Flyway holds DDL rights for schema migrations. The application user has no ALTER or DROP privileges. A compromised dependency cannot touch the schema. + +Secrets in SSM, never in config or env vars. Dependencies verified via SHA-256 checksums. Docker images cosign-signed against GitHub OIDC. Rate limiting at three independent layers — Cloudflare WAF, Nginx, and the application. Full OWASP Secure Headers enforced at Nginx. + +CI/CD runs three gates: Testcontainers unit tests, a Docker Compose smoke test with ephemeral credentials, and then SSM deploy. All three must pass — smoke failure blocks deploy. + +The full ADR breakdown is on Medium — seven decisions, why each alternative was rejected, and what I'd change. https://lnkd.in/g4aYJwuS + +The following posts in this series will cover the AWS cost architecture, the security design in detail, and why Cloudflare was chosen over AWS WAF. If any of these are relevant to what you're building, follow along. + +v2 — Enhancement Plan + +Every v2 item has a specific trigger: Redis cache-aside when DB read-throughput is the bottleneck, distributed rate limiting when autoscaling makes per-process state insufficient, and Turnstile CAPTCHA when distributed bots bypass per-IP limits. CloudWatch anomaly alerting, soft delete, and custom aliases follow the same principle — each deferred until the constraint is measured, not assumed. The ADRs document why. 
+ +- Source: https://lnkd.in/g5jcF3Uv + +#SystemDesign #AWS #Java #SpringBoot #Angular #Cloudflare #DevOps #PostgreSQL #OWASP \ No newline at end of file diff --git a/docs/insights/medium-cost-optimization.md b/docs/insights/medium-cost-optimization.md new file mode 100644 index 0000000..dd5b5ed --- /dev/null +++ b/docs/insights/medium-cost-optimization.md @@ -0,0 +1,211 @@ +--- +title: "Cutting My AWS Bill by 36% on a Portfolio Project — Every Decision I Made" +subtitle: "From $81/month to $52/month on a production-grade TinyURL service, and what's still on the table" +type: medium-post +date: 2026-04-22 +tags: [AWS, DevOps, Cloud Cost Optimization, Side Projects, Infrastructure] +--- + +# Cutting My AWS Bill by 36% on a Portfolio Project — Every Decision I Made + +My TinyURL side project is built to production standards — proper CI/CD, layered security, Cloudflare + CloudFront + ALB + Nginx in front, Spring Boot on EC2, PostgreSQL on RDS. It's a portfolio piece, but I run it like it's real. + +In March 2026, the first full month of the deployment, it cost $44.16. By April, the bill was projected to reach $81/month if left untouched. That's over $970/year for a side project. + +Here's the full audit of what I spent, every optimization I made, and the ones still in progress.
+ +--- + +## The Full Infrastructure Inventory + +Before cutting anything, I audited every line item: + +| Service | Resource | Cost/Month | +|---|---|---| +| EC2 | `ems-prod-app` (t2.micro) — 4 services in Docker | $8.35 | +| EC2 | `tinyurl-prod` (t3.small → t3.micro) | $14.98 → $7.49 | +| RDS | `tinyurl-prod` (db.t4g.micro, PostgreSQL 17.4, 20 GB gp3) | $13.70 | +| ALB | `tinyurl-alb` | $14.01 | +| Elastic IPs | 2x ALB EIPs + 1 idle (released) + EC2 EIPs | ~$11/month peak | +| Route 53 | `buffden.com` hosted zone | $0.54 | +| EBS | 8 GB gp3 + 20 GB gp2 | $1.50 | +| S3 + CloudFront | Static Angular SPA | Negligible | +| SSM Parameter Store | 37 params, all Standard tier | Free | +| CloudWatch Logs | 30-day retention | Minimal | + +**Baseline projection: ~$81/month** + +--- + +## The Architecture Decisions That Saved Money Before Day One + +The biggest cost wins weren't reactive — they were baked into the design. + +### 1. Static Angular SPA on S3 + CloudFront + +The frontend is a pure static build deployed to S3 and served through CloudFront. The alternative — running a Node server or serving assets from the Spring Boot host — would mean the EC2 instance handles every page load, increases coupling, and adds cost at scale. S3 + CloudFront charges are usage-based and at this traffic level, essentially zero. + +CloudFront handles routing too: `/api/*` and `/{shortCode}` go to the ALB; everything else serves `index.html` from S3 for SPA routing. 403/404 errors from S3 are remapped to `index.html` so browser-side routing works correctly. + +### 2. Cloudflare as the First Layer + +Before traffic ever reaches an AWS service, it hits Cloudflare. The DNS record for `tinyurl.buffden.com` is orange-cloud (proxied), meaning DNS resolves to Cloudflare's anycast edge — not CloudFront directly. + +The flow: `User → Cloudflare edge → CloudFront → S3 / ALB` + +This matters financially because DDoS and bot traffic gets absorbed by Cloudflare before it reaches AWS metered services. 
Cloudflare WAF, bot protection, and DDoS mitigation run at the edge. If a scraper hammers the redirect endpoint, Cloudflare eats it — not my ALB or EC2. + +### 3. Docker Compose on EC2 Instead of ECS/Fargate + +The `ems-prod-app` instance runs four services inside Docker Compose: Spring Boot, PostgreSQL, Redis, and Nginx. Total cost: $8.35/month on a t2.micro. + +ECS Fargate for the equivalent would be meaningfully more expensive, adds operational complexity, and is unnecessary at this scale. The precedent this set matters: TinyURL's own EC2 host follows the same model — Nginx + Spring Boot in Docker, using RDS for the database (for now). + +### 4. No NAT Gateway + +EC2 instances are in public subnets and reach the internet directly through the Internet Gateway. A NAT Gateway would add ~$32/month in baseline charges plus data processing fees. For instances that only need outbound access to pull container images and send CloudWatch logs, a NAT Gateway is over-engineered. RDS is in a private subnet (no internet access needed). + +### 5. SSM Session Manager Instead of a Bastion Host + +No SSH on port 22. No bastion EC2 instance. All shell access goes through AWS Systems Manager Session Manager. This means: +- No additional EC2 instance ($8–15/month saved) +- No security group rule opening port 22 to the internet +- All session activity logged to CloudWatch +- IAM-controlled access instead of SSH key management + +### 6. OIDC for CI/CD — No IAM Access Keys + +The GitHub Actions deploy pipeline authenticates to AWS using OIDC federation. GitHub's identity provider issues a short-lived token that AWS trusts; no long-lived access key is created or stored. Benefits: zero credential rotation overhead, no secret leakage risk, no IAM user to manage. + +### 7. GHCR for Container Images + +Container images are pushed to GitHub Container Registry (GHCR). For public repositories, GHCR is free with no egress charges for pulls. 
Using AWS ECR would add per-GB storage and data transfer charges — modest, but unnecessary. + +### 8. SSM Parameter Store Standard Tier for Secrets + +37 parameters (database credentials, app config, feature flags) are stored in SSM Parameter Store as Standard-tier SecureString entries encrypted with KMS. Standard tier is free for up to 10,000 parameters. AWS Secrets Manager would cost $0.40/secret/month — that adds up with multiple services. + +--- + +## The Three Reactive Optimizations (After the Bill Arrived) + +### Action 1: Release the Idle Elastic IP — Saves $3.65/month + +An EIP from a previous experiment was still allocated but not associated with any resource. AWS charges $3.65/month for any unassociated EIP. Released on 2026-04-02. + +This is the easiest win in any AWS cost audit: search for unassociated Elastic IPs and release them immediately. + +### Action 2: Downsize EC2 from t3.small → t3.micro — Saves $7.49/month + +The `tinyurl-prod` EC2 instance was originally provisioned as a t3.small at $14.98/month. After measuring actual utilization — memory at 45–57%, CPU rarely above 10% — there was no justification for the larger instance. Downsized to t3.micro ($7.49/month) on 2026-04-02. + +The EMS instance (`ems-prod-app`) couldn't be downsized — it's in availability zone us-east-1e, which only supports the t2 instance family, and the smallest available t2 is t2.micro (already in use). + +**The lesson:** Right-size based on measured utilization, not anticipated load. You can always scale up. You won't always remember to scale down. + +### Action 3: Night Scheduler — Saves ~$18/month + +This is the most impactful single change. 
The architecture: + +- **EventBridge** fires two cron rules: stop at 04:00 UTC (11 PM CDT), start at 12:00 UTC (7 AM CDT) weekdays +- **Lambda** executes EC2 `StopInstances` / `StartInstances` and RDS `StopDBInstance` / `StartDBInstance` +- **SQS Dead Letter Queue** captures any Lambda failures for retry/alerting +- **SNS** sends email notification on start/stop so I know when the environment is available +- **CloudWatch Alarm** monitors Lambda errors + +Total cost: $0/month. All components stay within AWS free tier. + +Uptime breakdown: +- Weekdays: ~16 hours on, ~8 hours off per day +- Weekends: fully off from Friday 11 PM to Monday 7 AM (56 hours) +- Effective uptime: ~48% of the week + +Savings: ~52% of EC2 + RDS costs that are time-based. On $34.68/month of stoppable compute and database charges, that's ~$18/month. + +**The caveat:** This works because it's a portfolio project. I don't need the service up at 2 AM. If this were serving real users, a different approach would be needed — autoscaling to zero on ECS Fargate or a similar demand-based shutdown strategy. + +--- + +## What's Still on the Table + +The optimizations above took the projected bill from $81 → $52/month (36% reduction). Two larger options are still being evaluated: + +### Option A: Remove the ALB + Release 2 EIPs — Saves $17.66/month + +The Application Load Balancer costs $14.01/month. Combined with the two EIPs attached to it ($7.30/month), removing it saves $21.31/month — minus one new EIP for the EC2 instance ($3.65/month) = **$17.66/month net savings**. + +The trade-off: TLS termination moves from the ALB to Nginx on EC2 directly. The EC2 security group needs rules updated to accept HTTPS traffic from CloudFront. ALB health checks are replaced by CloudFront origin health checks or a simple Route 53 health check. + +For a single-instance, single-region deployment with CloudFront in front, an ALB is over-specified.
CloudFront handles the load distribution to the origin; the ALB's only real job here is TLS termination and health checking. + +Risk: Medium. The Nginx TLS configuration must be correct, and CloudFront → EC2 connectivity needs validation before cutting over. + +### Option B: Migrate RDS to Docker PostgreSQL on EC2 — Saves $13.70/month + +The TinyURL RDS instance (db.t4g.micro) costs $13.70/month. The `ems-prod-app` EC2 already runs PostgreSQL inside Docker Compose alongside Spring Boot and Redis. TinyURL's EC2 could do the same. + +Current `tinyurl-prod` memory utilization: 45–57%. Adding a Docker PostgreSQL container (similar to EMS) would push it to ~62–82% — comparable to how EMS runs today on the same instance size. + +The precedent is solid: EMS has run this configuration stably. The risk is the operational difference — RDS provides automated backups, Multi-AZ failover options, and managed minor version updates. Moving to Docker PostgreSQL means managing backups manually (or via pg_dump cron), no built-in HA, and manual version updates. + +For a side project where the data can be reconstructed and downtime is acceptable: the risk is manageable. For a production system with real user data: RDS earns its cost. + +**Combined potential: -$60.64/month from baseline (-64% total reduction)** + +--- + +## The Backup Window Bug I Found + +While analyzing the RDS setup, I found a conflict between the night scheduler and the RDS automated backup window. + +The automated backup window was set to 09:14–09:44 UTC. The night scheduler stops the RDS instance at 04:00 UTC and restarts it at 12:00 UTC weekdays, and the instance is completely off on weekends. + +AWS behavior: if a scheduled backup window falls while an RDS instance is stopped, AWS auto-restarts the instance to take the backup, then stops it again. On weekends, this was adding ~36 minutes of unexpected RDS runtime — at odd hours, with an SNS alert triggering unnecessarily.
+ +Fix: Move the backup window to 18:00–18:30 UTC (1:00–1:30 PM CDT) — squarely inside the active window. + +**The lesson:** Any time scheduler changes interact with managed service maintenance windows, audit them for conflicts. + +--- + +## Current State: April 2026 + +| Action | Savings/Month | Status | +|---|---|---| +| Release idle EIP | $3.65 | Done | +| Downsize EC2 t3.small → t3.micro | $7.49 | Done | +| Night scheduler | ~$18.00 | Done | +| **Total achieved** | **~$29/month (36%)** | | +| Remove ALB + 2 EIPs | $17.66 | Evaluating | +| Migrate RDS → Docker PostgreSQL | $13.70 | Evaluating | +| **Total potential** | **~$60/month (64%)** | | + +**Projected final bill if all options complete: ~$21/month** + +--- + +## What I'd Do Differently From the Start + +1. **Provision t3.micro from day one** — I over-provisioned out of habit. One month of metrics would have confirmed the smaller instance was fine. + +2. **Set the backup window explicitly during provisioning** — RDS default backup windows are assigned randomly. When you layer a scheduler on top, you need to own that window. + +3. **Audit EIPs immediately after decommissioning any resource** — Treat detach and release as a single step. Detaching alone leaves the EIP allocated and still billed. + +4. **Night scheduler as a default for portfolio projects** — Not something to add after costs spike. Should be infrastructure-as-code from day one. + +The architecture itself I'd keep largely the same. Cloudflare in front, CloudFront for the SPA, Docker Compose on EC2, SSM for secrets and access — these all held up well. The costs were controllable from the start; I just didn't control them immediately. + +--- + +## Summary + +A production-grade portfolio project doesn't have to cost $80+/month. The combination of upfront architectural choices (CDN-hosted SPA, Cloudflare edge, no NAT gateway, SSM over bastion) and reactive optimizations (right-sizing, idle resource cleanup, scheduled downtime) gets you most of the way there.
+ +The remaining decisions — ALB removal and RDS migration — are data migration and operational risk decisions, not cost knowledge gaps. The cost information was always available. The work is in the execution. + +Total time spent on the scheduler build + optimizations: roughly a weekend. Monthly return: $29/month and counting. + +--- + +*The project is live at [tinyurl.buffden.com](https://tinyurl.buffden.com). Architecture docs, ADRs, and the full system design are in the repo at [github.com/buffden/tinyurl](https://github.com/buffden/tinyurl).* diff --git a/docs/insights/notes.txt b/docs/insights/notes.txt new file mode 100644 index 0000000..841fe61 --- /dev/null +++ b/docs/insights/notes.txt @@ -0,0 +1,16 @@ +IoT +limitations +hardware constraints +open problems and challenges - 6 +5Vs +owasp +oem +data security 3 ways +protocols securities + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +formal model driven analysis + +limitations +threat Models \ No newline at end of file