black homelan is gone; point install+publish+auth at the live cocotte ct-forge verdaccio (:4873) / forgejo (:3000). Config-only; resolution verified. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
501 lines
15 KiB
Markdown
501 lines
15 KiB
Markdown
# Deployment Guide
|
|
|
|
Deploy the analytics platform to production.
|
|
|
|
## Architecture Overview
|
|
|
|
```
|
|
┌─────────────────────────────────────────────────────────────────┐
│                          Load Balancer                          │
└─────────────────────────────────────────────────────────────────┘
                                 │
           ┌─────────────────────┼─────────────────────┐
           │                     │                     │
           ▼                     ▼                     ▼
   ┌───────────────┐     ┌───────────────┐     ┌───────────────┐
   │   Collector   │     │   Collector   │     │   Collector   │
   │    Service    │     │    Service    │     │    Service    │
   └───────────────┘     └───────────────┘     └───────────────┘
           │                     │                     │
           └─────────────────────┼─────────────────────┘
                                 │
                                 ▼
                         ┌───────────────┐
                         │     Redis     │
                         │   (BullMQ)    │
                         └───────────────┘
                                 │
           ┌─────────────────────┼─────────────────────┐
           │                     │                     │
           ▼                     ▼                     ▼
   ┌───────────────┐     ┌───────────────┐     ┌───────────────┐
   │   Processor   │     │   Processor   │     │   Processor   │
   │    Worker     │     │    Worker     │     │    Worker     │
   └───────────────┘     └───────────────┘     └───────────────┘
           │                     │                     │
           └─────────────────────┼─────────────────────┘
                                 │
                                 ▼
                         ┌───────────────┐
                         │  PostgreSQL   │
                         │  (TimescaleDB)│
                         └───────────────┘
                                 │
                                 ▼
                         ┌───────────────┐
                         │  API Service  │
                         └───────────────┘
|
|
```
|
|
|
|
## Services
|
|
|
|
| Service | Port | Description |
|
|
|---------|------|-------------|
|
|
| Collector | 4001 | Event ingestion |
|
|
| Processor | - | Queue worker (no HTTP) |
|
|
| API | 4002 | Query endpoints |
|
|
| Realtime | 4003 | WebSocket server |
|
|
|
|
## Docker Deployment
|
|
|
|
### docker-compose.yml
|
|
|
|
```yaml
|
|
# Analytics platform stack: collector, processor, api and realtime app tiers
# plus their Redis and TimescaleDB backends.
#
# Credentials are NOT embedded in this file. Export these variables in the
# environment the stack is launched from; startup aborts if one is missing:
#   POSTGRES_PASSWORD  password for the `postgres` database user
#   API_KEYS           comma-separated API keys accepted by the api service
# POSTGRES_PASSWORD is interpolated into a connection URL, so percent-encode
# any characters that are special in URLs (e.g. `@`, `:`, `/`).
version: '3.8'

services:
  # Event ingestion (HTTP 4001).
  collector:
    image: analytics/collector:latest
    ports:
      - "4001:4001"
    environment:
      - NODE_ENV=production
      - REDIS_URL=redis://redis:6379
      - LOG_LEVEL=info
    depends_on:
      - redis
    deploy:
      replicas: 3
      resources:
        limits:
          memory: 512M
          cpus: '0.5'

  # Queue worker; publishes no port.
  processor:
    image: analytics/processor:latest
    environment:
      - NODE_ENV=production
      - REDIS_URL=redis://redis:6379
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:?must be set}@postgres:5432/analytics
      - CONCURRENCY=10
    depends_on:
      - redis
      - postgres
    deploy:
      replicas: 2
      resources:
        limits:
          memory: 1G
          cpus: '1'

  # Query endpoints (HTTP 4002).
  api:
    image: analytics/api:latest
    ports:
      - "4002:4002"
    environment:
      - NODE_ENV=production
      - DATABASE_URL=postgresql://postgres:${POSTGRES_PASSWORD:?must be set}@postgres:5432/analytics
      - REDIS_URL=redis://redis:6379
      # Listed as required in the API Service environment table.
      - API_KEYS=${API_KEYS:?must be set}
    depends_on:
      - postgres
      - redis
    deploy:
      replicas: 2
      resources:
        limits:
          memory: 512M
          cpus: '0.5'

  # WebSocket server (4003).
  realtime:
    image: analytics/realtime:latest
    ports:
      - "4003:4003"
    environment:
      - NODE_ENV=production
      - REDIS_URL=redis://redis:6379
    depends_on:
      - redis
    deploy:
      replicas: 2

  # Redis with append-only persistence on a named volume.
  redis:
    image: redis:7-alpine
    volumes:
      - redis_data:/data
    command: redis-server --appendonly yes

  # PostgreSQL 15 with the TimescaleDB extension; data on a named volume.
  postgres:
    image: timescale/timescaledb:latest-pg15
    environment:
      - POSTGRES_DB=analytics
      - POSTGRES_USER=postgres
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?must be set}
    volumes:
      - postgres_data:/var/lib/postgresql/data

volumes:
  redis_data:
  postgres_data:
|
|
```
|
|
|
|
## Kubernetes Deployment
|
|
|
|
### Collector Deployment
|
|
|
|
```yaml
|
|
# Collector workload: a 3-replica Deployment, a ClusterIP Service in front of
# it, and a CPU-driven HorizontalPodAutoscaler.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: analytics-collector
spec:
  replicas: 3
  selector:
    matchLabels: {app: analytics-collector}
  template:
    metadata:
      labels: {app: analytics-collector}
    spec:
      containers:
        - name: collector
          image: analytics/collector:latest
          ports:
            - containerPort: 4001
          env:
            - name: NODE_ENV
              value: 'production'
            # Redis URL is read from the analytics-secrets Secret rather than
            # being written into the manifest.
            - name: REDIS_URL
              valueFrom:
                secretKeyRef:
                  key: redis-url
                  name: analytics-secrets
          resources:
            limits:
              cpu: '500m'
              memory: '512Mi'
            requests:
              cpu: '250m'
              memory: '256Mi'
          # Both probes call GET /health on the container port; liveness
          # starts later and polls less often than readiness.
          livenessProbe:
            httpGet:
              path: /health
              port: 4001
            initialDelaySeconds: 15
            periodSeconds: 20
          readinessProbe:
            httpGet:
              path: /health
              port: 4001
            initialDelaySeconds: 5
            periodSeconds: 10
---
# In-cluster endpoint for the collector pods (port 4001 -> 4001).
apiVersion: v1
kind: Service
metadata:
  name: analytics-collector
spec:
  type: ClusterIP
  selector:
    app: analytics-collector
  ports:
    - port: 4001
      targetPort: 4001
---
# Keeps the Deployment between 3 and 10 replicas, targeting 70% average CPU
# utilization.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: analytics-collector-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: analytics-collector
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
|
|
```
|
|
|
|
## Environment Variables
|
|
|
|
### Collector Service
|
|
|
|
| Variable | Required | Default | Description |
|
|
|----------|----------|---------|-------------|
|
|
| `NODE_ENV` | Yes | - | Environment (production/development) |
|
|
| `PORT` | No | 4001 | HTTP port |
|
|
| `REDIS_URL` | Yes | - | Redis connection URL |
|
|
| `LOG_LEVEL` | No | info | Logging level |
|
|
| `CORS_ORIGINS` | No | * | Allowed CORS origins |
|
|
|
|
### Processor Service
|
|
|
|
| Variable | Required | Default | Description |
|
|
|----------|----------|---------|-------------|
|
|
| `NODE_ENV` | Yes | - | Environment |
|
|
| `REDIS_URL` | Yes | - | Redis connection URL |
|
|
| `DATABASE_URL` | Yes | - | PostgreSQL connection URL |
|
|
| `CONCURRENCY` | No | 5 | Worker concurrency |
|
|
| `BATCH_SIZE` | No | 100 | Events per batch |
|
|
|
|
### API Service
|
|
|
|
| Variable | Required | Default | Description |
|
|
|----------|----------|---------|-------------|
|
|
| `NODE_ENV` | Yes | - | Environment |
|
|
| `PORT` | No | 4002 | HTTP port |
|
|
| `DATABASE_URL` | Yes | - | PostgreSQL connection URL |
|
|
| `REDIS_URL` | Yes | - | Redis for caching |
|
|
| `API_KEYS` | Yes | - | Comma-separated API keys |
|
|
|
|
## Database Setup
|
|
|
|
### PostgreSQL with TimescaleDB
|
|
|
|
```sql
|
|
-- Create database
CREATE DATABASE analytics;

-- Connect to the new database before running the rest of this script
-- (in psql: \c analytics). CREATE EXTENSION and CREATE TABLE act on the
-- database of the current connection, not on the one just created.

-- Enable TimescaleDB
CREATE EXTENSION IF NOT EXISTS timescaledb;

-- Create tables

-- Raw event log: one row per ingested event.
-- The primary key is (id, timestamp) rather than id alone: TimescaleDB
-- requires every unique index on a hypertable, including the primary key,
-- to contain the partitioning column, so create_hypertable() below rejects
-- a table whose primary key omits "timestamp".
CREATE TABLE raw_events (
    id UUID NOT NULL DEFAULT gen_random_uuid(),
    session_id VARCHAR(64) NOT NULL,
    user_id VARCHAR(255),
    event_type VARCHAR(100) NOT NULL,
    event_action VARCHAR(255) NOT NULL,
    metadata JSONB DEFAULT '{}',
    timestamp TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (id, timestamp)
);

-- Convert to hypertable for time-series optimization
SELECT create_hypertable('raw_events', 'timestamp');

-- Create indexes
CREATE INDEX idx_raw_events_session ON raw_events(session_id);
CREATE INDEX idx_raw_events_user ON raw_events(user_id);
CREATE INDEX idx_raw_events_type ON raw_events(event_type);
CREATE INDEX idx_raw_events_metadata ON raw_events USING GIN(metadata);

-- Aggregated tables

-- One counter per (date, metric, dimension key, dimension value).
-- dimension_key / dimension_value are part of the primary key, which already
-- forces them to be NOT NULL; the constraint is spelled out so the column
-- definitions match what the database enforces.
CREATE TABLE daily_metrics (
    date DATE NOT NULL,
    metric_name VARCHAR(100) NOT NULL,
    dimension_key VARCHAR(255) NOT NULL,
    dimension_value VARCHAR(255) NOT NULL,
    value BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (date, metric_name, dimension_key, dimension_value)
);

-- Retention policy: keep raw events for 90 days
SELECT add_retention_policy('raw_events', INTERVAL '90 days');
|
|
```
|
|
|
|
### Schema Management & Drift
|
|
|
|
Production runs TypeORM with **`synchronize: false`** (auto-sync risks destructive
|
|
changes) and **no migration runner**. Two consequences that have caused real outages:
|
|
|
|
1. **The processor's `SchemaGuardService` is the schema authority** for DDL the entity
|
|
decorators can't express, or that a long-lived / freshly-provisioned database might
|
|
lack. It runs on processor startup (`onModuleInit`) and idempotently ensures critical
|
|
objects — e.g. the `aggregated_metrics` `NULLS NOT DISTINCT` dedup index and the
|
|
`session_fingerprints` enrichment columns.
|
|
|
|
2. **Adding a `@Column` to an entity does NOT add it to a long-lived prod table.** With
|
|
`synchronize` off, the column exists in code but not in the database, so every INSERT
|
|
referencing it throws `column "…" does not exist`. If that write path swallows errors
|
|
(e.g. `upsertSessionFingerprint` treats fingerprinting as best-effort), the failure is
|
|
**silent**: the canonical table (`raw_events`) keeps filling while the derived table
|
|
(`session_fingerprints`) freezes.
|
|
|
|
**Symptom:** dashboard pages backed by the derived table — Traffic, Audience, Network,
|
|
which read `session_fingerprints` — show `0` / "no data", while raw-event-backed pages
|
|
(Overview, Pages, Events) look fine. The API returns a successful empty `[]`, so it
|
|
reads as a quiet period, not an error.
|
|
|
|
**Rule: when you add an entity column prod must have, add it to `SchemaGuardService` too.**
|
|
|
|
```ts
|
|
// services/processor/src/schema-guard.service.ts → onModuleInit()
|
|
await this.dataSource.query(`
|
|
ALTER TABLE IF EXISTS session_fingerprints
|
|
ADD COLUMN IF NOT EXISTS "newField" varchar(30)
|
|
`);
|
|
```
|
|
|
|
Additive `ADD COLUMN IF NOT EXISTS` is safe on every startup. To unblock a running prod
|
|
DB immediately (no redeploy — the running service's next INSERT succeeds once the column
|
|
exists):
|
|
|
|
```sql
|
|
ALTER TABLE session_fingerprints ADD COLUMN IF NOT EXISTS "newField" varchar(30);
|
|
```
|
|
|
|
> Incident history (same class both times): 2026-05-16→06-07 — missing `aggregated_metrics`
|
|
> dedup index, every aggregation failing for three weeks. 2026-06-21 — missing
|
|
> `session_fingerprints` gov/ASN columns (`isGovernment`, `orgType`, `responseTier`,
|
|
> `org`, `asn`), every fingerprint INSERT failing, Traffic/Audience/Network blank.
|
|
|
|
## Nginx Configuration
|
|
|
|
```nginx
|
|
# Everything in this snippet belongs at http level (e.g. a conf.d/*.conf file).

# Cache zone used by `proxy_cache api_cache` in the /api location. nginx
# refuses to load a config that references an undefined zone. Remove this
# directive if your main nginx.conf already defines `api_cache`.
proxy_cache_path /var/cache/nginx/api levels=1:2 keys_zone=api_cache:10m
                 max_size=1g inactive=10m;

# Collector pool: route each request to the server with the fewest active
# connections.
upstream collector {
    least_conn;
    server collector-1:4001;
    server collector-2:4001;
    server collector-3:4001;
}

# API pool: default round-robin balancing.
upstream api {
    server api-1:4002;
    server api-2:4002;
}

upstream realtime {
    ip_hash;  # Sticky sessions for WebSocket
    server realtime-1:4003;
    server realtime-2:4003;
}

server {
    listen 443 ssl http2;
    server_name analytics.example.com;

    ssl_certificate /etc/ssl/certs/analytics.crt;
    ssl_certificate_key /etc/ssl/private/analytics.key;

    # Collector - high throughput
    location /collect {
        proxy_pass http://collector;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Don't buffer - fast response
        proxy_buffering off;

        # Request body limit for event batches. 1m is also nginx's default,
        # so raise this value if clients send larger batches.
        client_max_body_size 1m;
    }

    # API - standard REST
    location /api {
        proxy_pass http://api;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;

        # Cache successful (200) responses for one minute.
        # NOTE(review): the cache key is method + URI only, so a cached
        # response is served to any later caller of the same URL without
        # reaching the API. If API keys gate access or scope results, add
        # the header that carries the key to proxy_cache_key - confirm how
        # clients send API keys.
        proxy_cache api_cache;
        proxy_cache_valid 200 1m;
        proxy_cache_key "$request_method$request_uri";
        add_header X-Cache-Status $upstream_cache_status;
    }

    # WebSocket - realtime
    location /realtime {
        proxy_pass http://realtime;
        proxy_http_version 1.1;
        # Forward the protocol-upgrade handshake to the backend.
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;

        # Long-lived connections
        proxy_read_timeout 86400s;
        proxy_send_timeout 86400s;
    }
}
|
|
```
|
|
|
|
## Monitoring
|
|
|
|
### Health Checks
|
|
|
|
All services expose a `/health` endpoint:
|
|
|
|
```json
|
|
{
|
|
"status": "healthy",
|
|
"version": "1.0.0",
|
|
"uptime": 86400,
|
|
"checks": {
|
|
"redis": "ok",
|
|
"database": "ok"
|
|
}
|
|
}
|
|
```
|
|
|
|
### Metrics (Prometheus)
|
|
|
|
Services expose a `/metrics` endpoint:
|
|
|
|
```
|
|
# Collector metrics
|
|
analytics_events_received_total{type="engagement"} 1234567
|
|
analytics_events_queued_total 1234500
|
|
analytics_batch_size_histogram_bucket{le="10"} 50000
|
|
|
|
# Processor metrics
|
|
analytics_events_processed_total 1234000
|
|
analytics_processing_duration_seconds_bucket{le="0.1"} 1200000
|
|
analytics_queue_depth 500
|
|
|
|
# API metrics
|
|
analytics_api_requests_total{endpoint="/trends",status="200"} 50000
|
|
analytics_api_latency_seconds_bucket{le="0.5"} 49000
|
|
```
|
|
|
|
### Grafana Dashboards
|
|
|
|
Import pre-built dashboards from `/dashboards/`:
|
|
- `collector-metrics.json` - Ingestion throughput
|
|
- `processor-metrics.json` - Processing performance
|
|
- `api-metrics.json` - Query latency and errors
|
|
- `business-metrics.json` - Analytics KPIs
|
|
|
|
## Scaling Guidelines
|
|
|
|
### Collector Service
|
|
|
|
- Scale horizontally based on incoming event rate
|
|
- Target: <100ms p99 response time
|
|
- Rule of thumb: 1 replica per 10,000 events/minute
|
|
|
|
### Processor Service
|
|
|
|
- Scale based on queue depth
|
|
- Target: Queue depth < 1000
|
|
- Increase `CONCURRENCY` before adding replicas
|
|
|
|
### API Service
|
|
|
|
- Scale based on query latency
|
|
- Target: <500ms p95 for complex queries
|
|
- Add read replicas to PostgreSQL for heavy read load
|
|
|
|
### Database
|
|
|
|
- Use TimescaleDB compression for historical data
|
|
- Partition by month for large deployments
|
|
- Consider ClickHouse for >1B events/day
|