From 1897d2b64db7e6089330fd18ca1569e5df642447 Mon Sep 17 00:00:00 2001 From: herbygitea Date: Tue, 24 Mar 2026 20:25:28 +0000 Subject: [PATCH] Create pbs-youtube-analytics.md via n8n --- PBS/Tech/Projects/pbs-youtube-analytics.md | 350 +++++++++++++++++++++ 1 file changed, 350 insertions(+) create mode 100644 PBS/Tech/Projects/pbs-youtube-analytics.md diff --git a/PBS/Tech/Projects/pbs-youtube-analytics.md b/PBS/Tech/Projects/pbs-youtube-analytics.md new file mode 100644 index 0000000..6260293 --- /dev/null +++ b/PBS/Tech/Projects/pbs-youtube-analytics.md @@ -0,0 +1,350 @@ +--- +project: pbs-youtube-analytics +type: project-plan +status: active +tags: + - pbs + - youtube + - python + - automation + - n8n + - flask + - streamlit + - analytics +created: 2026-03-23 +updated: 2026-03-23 +path: PBS/Tech/Projects/ +--- + +# PBS YouTube Analytics Pipeline + +## Project Goal +Build a self-hosted YouTube analytics pipeline for the PBS channel that +collects video performance data (with a focus on audience retention), +stores it in SQLite, automates collection via n8n, sends alerts to Google +Chat, and visualizes insights through a Streamlit dashboard. + +## Why This Matters +YouTube Studio's built-in analytics are limited and don't let us slice data +the way we need. By owning the raw data, Travis can do proper analysis in +Python/R, and Jenny gets a clean dashboard showing what's actually working +in our content — especially where viewers drop off or rewatch. 
+ +--- + +## Architecture Overview + +``` +YouTube Analytics API + | + Python Collector Script (PyCharm + UV) + | + SQLite Database (self-contained file) + | + ┌────┴────┐ + │ │ + n8n Streamlit +(schedule (dashboard ++ alerts) via Traefik) +``` + +- **Data Collection:** Python script using `google-api-python-client` + +`google-auth-oauthlib` +- **Storage:** SQLite database file (lightweight, portable, perfect for +read-heavy analytics) +- **Automation:** n8n triggers collection on schedule, sends Google Chat +alerts +- **Visualization:** Streamlit app served as Docker container behind Traefik + +--- + +## Phase 1: Google Cloud + API Setup +**Estimated Time:** 1-2 hours +**Goal:** Get API credentials and verify access to PBS YouTube data + +### Tasks +- [ ] Create Google Cloud project (or use existing PBS project) +- [ ] Enable YouTube Data API v3 +- [ ] Enable YouTube Analytics API v2 +- [ ] Configure OAuth consent screen (Internal if using Workspace, External +otherwise) +- [ ] Create OAuth 2.0 Desktop App credentials +- [ ] Download `client_secret.json` +- [ ] Test OAuth flow — authorize and confirm access to PBS channel data + +### Key Details +- **Required OAuth scope:** ` +https://www.googleapis.com/auth/yt-analytics.readonly` +- **Additional scope for video metadata:** ` +https://www.googleapis.com/auth/youtube.readonly` +- OAuth tokens will be stored securely and refreshed automatically +- First auth requires browser interaction; subsequent runs use refresh token + +### Deliverable +Working OAuth credentials that can query the PBS channel's analytics data + +--- + +## Phase 2: Python Data Collector +**Estimated Time:** 3-4 hours +**Goal:** Python script that pulls video stats and retention data into +SQLite +**Tools:** PyCharm Professional, UV package manager + +### Tasks +- [ ] Initialize project with UV (`uv init pbs-youtube-analytics`) +- [ ] Install dependencies: `google-api-python-client`, +`google-auth-oauthlib`, `google-auth-httplib2` +- [ ] Build 
OAuth2 auth module with token persistence (refresh token stored
+in JSON)
+- [ ] Build video list collector (pulls all PBS videos/shorts with metadata)
+- [ ] Build retention data collector (audience retention curves per video)
+- [ ] Build general metrics collector (views, watch time, likes, traffic
+sources, etc.)
+- [ ] Design and create SQLite schema
+- [ ] Implement data ingestion with upsert logic (idempotent runs)
+- [ ] Add CLI interface for manual runs and backfill
+- [ ] Test with real PBS channel data
+
+### SQLite Schema (Initial Design)
+
+```sql
+-- NOTE: SQLite ships with foreign-key enforcement OFF; the collector must
+-- run `PRAGMA foreign_keys = ON;` on every connection, otherwise the FK
+-- constraints below are silently ignored.
+-- All DDL uses IF NOT EXISTS so schema creation is idempotent and can run
+-- safely on every collector start (matches the idempotent-runs goal above).
+
+-- Video metadata from Data API
+CREATE TABLE IF NOT EXISTS videos (
+    video_id TEXT PRIMARY KEY,
+    title TEXT NOT NULL,
+    published_at TEXT NOT NULL,
+    duration_seconds INTEGER,
+    video_type TEXT CHECK (video_type IN ('video', 'short')),
+    thumbnail_url TEXT,
+    description TEXT,
+    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Daily aggregate metrics from Analytics API
+CREATE TABLE IF NOT EXISTS video_daily_metrics (
+    video_id TEXT NOT NULL,
+    date TEXT NOT NULL,
+    views INTEGER DEFAULT 0,
+    estimated_minutes_watched REAL DEFAULT 0,
+    average_view_duration_seconds REAL DEFAULT 0,
+    average_view_percentage REAL DEFAULT 0,
+    likes INTEGER DEFAULT 0,
+    dislikes INTEGER DEFAULT 0,
+    comments INTEGER DEFAULT 0,
+    shares INTEGER DEFAULT 0,
+    subscribers_gained INTEGER DEFAULT 0,
+    subscribers_lost INTEGER DEFAULT 0,
+    impressions INTEGER DEFAULT 0,
+    impressions_ctr REAL DEFAULT 0,
+    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+    PRIMARY KEY (video_id, date),
+    CONSTRAINT video_daily_metrics_video_id_fk
+        FOREIGN KEY (video_id) REFERENCES videos(video_id)
+        ON DELETE CASCADE
+);
+
+-- Dashboard trend views filter by date range; this index keeps those
+-- queries from scanning the whole metrics table.
+CREATE INDEX IF NOT EXISTS video_daily_metrics_date_idx
+    ON video_daily_metrics(date);
+
+-- Audience retention curve (100 data points per video)
+CREATE TABLE IF NOT EXISTS video_retention (
+    video_id TEXT NOT NULL,
+    elapsed_ratio REAL NOT NULL,
+    audience_watch_ratio REAL NOT NULL,
+    relative_retention_performance REAL,
+    fetched_at TEXT DEFAULT CURRENT_TIMESTAMP,
+    PRIMARY KEY (video_id, elapsed_ratio),
+    CONSTRAINT video_retention_video_id_fk
+        FOREIGN KEY (video_id) REFERENCES videos(video_id)
+        ON DELETE CASCADE
+);
+
+-- Traffic source breakdown per video per day
+CREATE TABLE IF NOT EXISTS video_traffic_sources (
+    video_id TEXT NOT NULL,
+    date TEXT NOT NULL,
+    traffic_source TEXT NOT NULL,
+    views INTEGER DEFAULT 0,
+    estimated_minutes_watched REAL DEFAULT 0,
+    PRIMARY KEY (video_id, date, traffic_source),
+    CONSTRAINT video_traffic_sources_video_id_fk
+        FOREIGN KEY (video_id) REFERENCES videos(video_id)
+        ON DELETE CASCADE
+);
+
+-- Channel-level daily summary
+CREATE TABLE IF NOT EXISTS channel_daily_metrics (
+    date TEXT PRIMARY KEY,
+    total_views INTEGER DEFAULT 0,
+    total_estimated_minutes_watched REAL DEFAULT 0,
+    subscribers_gained INTEGER DEFAULT 0,
+    subscribers_lost INTEGER DEFAULT 0,
+    -- presumably denormalized gained - lost; confirm in collector logic
+    net_subscribers INTEGER DEFAULT 0,
+    created_at TEXT DEFAULT CURRENT_TIMESTAMP
+);
+```
+
+### API Details — Retention Data
+- **Endpoint:** YouTube Analytics API v2 `reports.query`
+- **Dimension:** `elapsedVideoTimeRatio` (100 data points, values 0.01 to
+1.0)
+- **Metrics available:**
+  - `audienceWatchRatio` — absolute retention (can exceed 1.0 for rewatched
+segments)
+  - `relativeRetentionPerformance` — compared to similar-length YouTube
+videos (0 to 1 scale)
+  - `startedWatching` — how often viewers started watching at this point
+  - `stoppedWatching` — how often viewers stopped watching at this point
+- **Limitation:** Retention data is per-video only (one video per API call,
+no further dimension splits)
+- **Note:** For a 60-second Short, each data point ≈ 0.6 seconds. For a
+10-minute video, each ≈ 6 seconds.
+ +### Deliverable +Python CLI tool that pulls all PBS video data + retention curves into a +local SQLite database + +--- + +## Phase 3: n8n Automation + Alerts +**Estimated Time:** 2-3 hours +**Goal:** Automate daily data collection and send performance alerts to +Google Chat + +### Tasks +- [ ] Deploy collector script to Linode server (alongside n8n) +- [ ] Create n8n workflow: daily scheduled trigger → Execute Command node → +runs Python collector +- [ ] Add error handling: notify Google Chat on collection failures +- [ ] Create weekly digest alert: top performing videos, notable retention +patterns +- [ ] Create threshold alerts: video crosses view milestones, unusual +engagement spikes +- [ ] Test scheduled execution end-to-end + +### Alert Ideas +- **Weekly Digest (for Jenny):** Top 5 videos this week by views, best +retention video, shorts vs long-form comparison +- **Spike Alert:** Video gets 2x+ its average daily views +- **Milestone Alert:** Video crosses 1K, 5K, 10K views +- **New Video Check-in:** 48-hour performance report for newly published +content + +### Deliverable +Automated daily collection with Google Chat alerts for notable events + +--- + +## Phase 4: Streamlit Dashboard +**Estimated Time:** 4-6 hours +**Goal:** Interactive web dashboard for Jenny and Travis to explore PBS +YouTube performance + +### Tasks +- [ ] Initialize Streamlit project with UV +- [ ] Build retention heatmap view (the star feature) +- [ ] Build video comparison view (side-by-side retention curves) +- [ ] Build channel overview page (trends over time) +- [ ] Build shorts vs long-form comparison view +- [ ] Build traffic source analysis view +- [ ] Dockerize Streamlit app +- [ ] Add to docker-compose with Traefik labels +- [ ] Deploy to staging first, then production +- [ ] Secure with Authelia (when SSO rollout happens) or basic auth +initially + +### Dashboard Pages (Initial Concept) +1. 
**Channel Overview** — subscriber trend, total views/watch time over +time, publishing cadence +2. **Video Deep Dive** — select a video, see retention curve, daily +metrics, traffic sources +3. **Retention Heatmap** — all videos on one view, color-coded by retention +quality at each time segment +4. **Shorts Lab** — Shorts-specific view comparing hook effectiveness +(first 3 seconds), rewatch rates +5. **What's Working** — auto-surfaced insights: best retention patterns, +top traffic sources, optimal video length + +### Deployment +- Streamlit container behind Traefik at `analytics.plantbasedsoutherner.com` +(or similar subdomain) +- Reads from same SQLite file populated by the collector +- Protected by basic auth initially, Authelia later + +### Deliverable +Live dashboard accessible to Jenny and Travis showing PBS YouTube +performance with retention analysis + +--- + +## Phase 5: Advanced Analysis & Iteration +**Estimated Time:** Ongoing +**Goal:** Leverage the data for deeper content strategy insights + +### Future Ideas +- [ ] Correlate retention patterns with recipe categories (link to +`pbs_recipes` table) +- [ ] A/B analysis: compare thumbnail styles, intro approaches, video +lengths +- [ ] Optimal posting time analysis using traffic source timing data +- [ ] Export data to R for statistical modeling +- [ ] Instagram vs YouTube cross-platform performance comparison +- [ ] Automated content recommendations based on what's performing + +--- + +## Prerequisites & Dependencies + +| Requirement | Status | Notes | +|---|---|---| +| Google Cloud project | Needed | May already exist for Google Workspace | +| YouTube Analytics API enabled | Needed | Free, quota-based | +| OAuth 2.0 credentials | Needed | Desktop app type | +| Python + UV | Ready | Travis's local dev setup | +| Linode server access | Ready | Same server running n8n | +| n8n operational | Ready | Already running PBS automation | +| Traefik reverse proxy | Ready | For Streamlit subdomain | +| SQLite | 
Ready | Ships with Python, no setup needed | + +--- + +## API Quotas & Limits +- YouTube Analytics API: 200 queries/day default (can request increase) +- YouTube Data API v3: 10,000 units/day (listing videos costs ~1-3 units +each) +- Retention data: one video per API call (plan batch collection accordingly) +- Data availability: typically 2-3 day delay from YouTube + +--- + +## Key Decisions + +| Decision | Choice | Rationale | +|---|---|---| +| Database | SQLite | Self-contained, portable, perfect for read-heavy +analytics workload. No server process needed. | +| Dashboard | Streamlit | Python-native, fast to build, interactive. Travis +can leverage data analyst skills directly. | +| API approach | YouTube Analytics API (targeted queries) | Real-time, +flexible dimensions/metrics. Better than Reporting API for our scale. | +| Hosting | Linode (same server) | Keeps everything centralized with +existing PBS infrastructure. | + +--- + +## Sequencing & Priority +1. **Phase 1** (API Setup) → unblocks everything +2. **Phase 2** (Python Collector) → gets data flowing, enables ad-hoc +analysis immediately +3. **Phase 3** (n8n Automation) → removes manual collection burden +4. **Phase 4** (Streamlit Dashboard) → gives Jenny self-service access to +insights +5. **Phase 5** (Advanced Analysis) → ongoing value extraction + +--- + +## Relationship to Other PBS Projects +- **PBS Content Hub (Phase 5):** Dashboard could eventually be a tab within +the Content Hub +- **Authelia SSO:** Will protect the Streamlit dashboard once rolled out +- **WordPress-to-MySQL sync:** Could correlate website recipe traffic with +YouTube performance +- **Instagram automation:** Cross-platform analysis potential (YouTube + +Instagram data in one place) + +--- + +*Next Step: Phase 1 — Set up Google Cloud project and enable YouTube APIs* \ No newline at end of file