From b067b6bc9ceda761e3e89192a96fe57f32d20946 Mon Sep 17 00:00:00 2001 From: zukwiz Date: Wed, 27 May 2026 19:07:30 +0200 Subject: [PATCH 1/3] cmd/seed: synthetic dataset for local development MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a deterministic ~6-month synthetic dataset (~9.8k rows, gentle sinusoid with occasional spikes and quiet days) for exercising the dashboard locally without needing real production exports. The generator deliberately spans every period (7d / 30d / 3m / 6m / 1y) so the chart UI has data to render at any range. Safety properties: - Refuses to run unless Config.Environment == "development". - INSERT … ON CONFLICT (id) DO NOTHING, so re-running is a no-op. - Steam IDs use a clearly-synthetic 76561198000000000 prefix. - Snowflake IDs encode the same created_at + sequence layout as the production generator, so synthetic rows sort chronologically alongside any real rows already in the DB. internal-docs/ and internal/devseed/fixtures/ are added to .gitignore to keep author scratch space and any future local CSV fixtures out of the public repo. Co-authored-by: Cursor --- .gitignore | 3 + README.md | 15 +++ cmd/seed/main.go | 61 ++++++++++++ internal/devseed/synthetic.go | 179 ++++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+) create mode 100644 cmd/seed/main.go create mode 100644 internal/devseed/synthetic.go diff --git a/.gitignore b/.gitignore index 595dfa4..4e1454c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ static/*.db-wal .secrets .env + +internal-docs/ +internal/devseed/fixtures/ diff --git a/README.md b/README.md index 1926b7f..3805cbb 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,21 @@ If you're looking to participate by contributing reversal reports (i.e. marketpl The server starts on port `80` by default (configurable via `HTTP_PORT`). +### Seeding local data + +Production ingests live data from contributing marketplaces. 
For local development, a deterministic synthetic dataset (~6 months, ~9.8k rows with realistic daily variance) can be loaded with: + +```bash +go run ./cmd/seed +``` + +The seed: + +- Refuses to run unless `Environment` is `development`. +- Uses `INSERT … ON CONFLICT (id) DO NOTHING`, so it's safe to re-run. +- Generates a deterministic 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y). +- Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake. + ## Configuration Configuration is loaded from environment variables or a `config.json` file. diff --git a/cmd/seed/main.go b/cmd/seed/main.go new file mode 100644 index 0000000..e683514 --- /dev/null +++ b/cmd/seed/main.go @@ -0,0 +1,61 @@ +// Command seed loads a deterministic synthetic dataset into the local +// Reverse Watch Postgres databases for local dashboard development. +// It is NEVER intended to run in production and refuses to run unless +// Config.Environment is "development". +// +// go run ./cmd/seed +// +// The insert uses ON CONFLICT (id) DO NOTHING, so re-running is safe. +package main + +import ( + "fmt" + "os" + "time" + + "reverse-watch/config" + "reverse-watch/domain/models" + "reverse-watch/domain/models/constants" + "reverse-watch/internal/devseed" + "reverse-watch/logging" + "reverse-watch/repository/factory" + "reverse-watch/secret" +) + +func main() { + logging.Initialize() + cfg := config.Load() + + if cfg.Environment != constants.EnvironmentDevelopment { + fmt.Fprintf(os.Stderr, "refusing to seed: environment is %q (only %q is allowed)\n", cfg.Environment, constants.EnvironmentDevelopment) + os.Exit(1) + } + + // Required by factory bootstrap (e.g. admin API key seeding). The + // synthetic generator pre-populates its own IDs, so the snowflake + // generator does not actually run for them. 
+ models.InitSnowflakeGenerator(0, 0) + + keygen := secret.NewKeyGenerator(cfg.Environment) + f, err := factory.NewFactory(cfg, keygen) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to create factory: %v\n", err) + os.Exit(1) + } + defer func() { + if err := f.Close(); err != nil { + fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err) + } + }() + + reversals := devseed.GenerateSynthetic(time.Now().UTC()) + fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals)) + + inserted, err := devseed.InsertReversals(f.PublicDB(), reversals) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to insert reversals: %v\n", err) + os.Exit(1) + } + skipped := int64(len(reversals)) - inserted + fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped) +} diff --git a/internal/devseed/synthetic.go b/internal/devseed/synthetic.go new file mode 100644 index 0000000..8b93957 --- /dev/null +++ b/internal/devseed/synthetic.go @@ -0,0 +1,179 @@ +// Package devseed loads dev-only fixture data into the local Postgres +// instance. It is intentionally not wired into the main binary — call it +// from cmd/seed (or a test) when you need realistic data locally. +package devseed + +import ( + "math" + "math/rand" + "time" + + "reverse-watch/domain/models" + + "gorm.io/gorm" + "gorm.io/gorm/clause" +) + +const ( + syntheticRNGSeed int64 = 42 + syntheticDays = 180 + syntheticTargetTotal = 9800 + syntheticBaseSteamID uint64 = 76561198000000000 + syntheticBaseReporter uint = 2_900_000 +) + +var syntheticMarketplaces = []struct { + slug string + weight float64 +}{ + {"csfloat", 0.80}, + {"tradeit", 0.10}, + {"skinport", 0.05}, + {"swap.gg", 0.05}, +} + +// GenerateSynthetic returns a deterministic ~6-month dataset (~9,800 rows, +// at least one per day, gentle sinusoid with occasional spikes / quiet +// days). 
Snowflake IDs are unique within the slice and won't collide with +// real CSV-seeded IDs, so callers can pipe the result straight into +// InsertReversals. +func GenerateSynthetic(now time.Time) []*models.Reversal { + rng := rand.New(rand.NewSource(syntheticRNGSeed)) + nowMs := uint64(now.UnixMilli()) + today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) + + counts := make([]int, syntheticDays) + for d := 0; d < syntheticDays; d++ { + base := 40.0 + 10.0*math.Sin(float64(d)/30.0) + var mult float64 + switch r := rng.Float64(); { + case r < 0.05: + mult = 2.5 + rng.Float64()*2.5 + case r < 0.15: + mult = 0.2 + rng.Float64()*0.3 + default: + mult = 0.7 + rng.Float64()*0.6 + } + counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5)) + } + + total := 0 + for _, c := range counts { + total += c + } + if total > 0 { + scale := float64(syntheticTargetTotal) / float64(total) + for d := range counts { + counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale))) + } + } + + rows := make([]*models.Reversal, 0, syntheticTargetTotal+200) + var steamOffset uint64 = 1 + var seq uint16 + + for d := 0; d < syntheticDays; d++ { + dayStart := today.AddDate(0, 0, -(syntheticDays-1-d)) + for i := 0; i < counts[d]; i++ { + reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour))) + if uint64(reversedAt.UnixMilli()) > nowMs { + reversedAt = now.Add(-1 * time.Minute) + } + reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute + createdAt := reversedAt.Add(reportDelay) + if uint64(createdAt.UnixMilli()) > nowMs { + createdAt = now + } + + srcRoll := rng.Float64() + var src models.Source + var related *models.SteamID + switch { + case srcRoll < 0.90: + src = models.SourceDirect + case srcRoll < 0.95: + src = models.SourceRelatedUser + relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97) + related = &relID + default: + src = models.SourceUserReport + } + + var expunged *uint64 + if rng.Float64() < 0.015 { 
+ eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute) + if uint64(eAt.UnixMilli()) < nowMs && uint64(eAt.UnixMilli()) > uint64(createdAt.UnixMilli()) { + ems := uint64(eAt.UnixMilli()) + expunged = &ems + } + } + + steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50))) + steamOffset++ + + // Snowflake encodes created_at + a 12-bit per-ms sequence; + // mirrors domain/models/snowflake.go so generated IDs sort + // chronologically alongside production rows. + seq = (seq + 1) & 0x0FFF + sfTs := uint64(createdAt.UnixMilli()) - models.Epoch + sf := models.Snowflake((sfTs << 22) | uint64(seq)) + + reporter := syntheticBaseReporter + uint(steamOffset) + mp := pickMarketplace(rng) + + rows = append(rows, &models.Reversal{ + Model: models.Model{ + ID: sf, + CreatedAt: uint64(createdAt.UnixMilli()), + UpdatedAt: uint64(createdAt.UnixMilli()), + }, + SteamID: steamID, + MarketplaceSlug: mp, + Source: &src, + RelatedSteamID: related, + ReversedAt: uint64(reversedAt.UnixMilli()), + ReporterInternalID: &reporter, + ExpungedAt: expunged, + }) + } + } + return rows +} + +func pickMarketplace(rng *rand.Rand) string { + r := rng.Float64() + cum := 0.0 + for _, mp := range syntheticMarketplaces { + cum += mp.weight + if r < cum { + return mp.slug + } + } + return syntheticMarketplaces[0].slug +} + +// insertChunkSize keeps each bulk insert under Postgres's 65,535 +// parameter-per-statement cap. At ~11 columns per Reversal, 1,000 rows +// uses ~11k parameters. +const insertChunkSize = 1000 + +// InsertReversals bulk-inserts reversals with ON CONFLICT (id) DO NOTHING, +// so the seed is idempotent. Returns the number of rows actually inserted. 
+func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) { + if len(reversals) == 0 { + return 0, nil + } + var inserted int64 + for i := 0; i < len(reversals); i += insertChunkSize { + end := i + insertChunkSize + if end > len(reversals) { + end = len(reversals) + } + res := db.Clauses(clause.OnConflict{DoNothing: true}).Create(reversals[i:end]) + if res.Error != nil { + return inserted, res.Error + } + inserted += res.RowsAffected + } + return inserted, nil +} From c4d85230759e70eda09e6da48b91cf9d90b11e26 Mon Sep 17 00:00:00 2001 From: zukwiz Date: Wed, 24 Jun 2026 13:32:24 +0200 Subject: [PATCH 2/3] devseed: address seed review feedback - Give syntheticTargetTotal an explicit int type and reduce the seed to ~2,000 rows (a couple thousand) for faster local seeding. - Normalize the incoming now to UTC before deriving any dates so the generated series is timezone-independent. - Simplify the README seeding guard to "must be run in a development environment", move the seeding section to the end of the README, and stop claiming idempotency (reruns add more data since IDs derive from wall-clock time). Co-authored-by: Cursor --- README.md | 32 ++++++++++++++++---------------- internal/devseed/synthetic.go | 18 ++++++++++-------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 3805cbb..5a4ceed 100644 --- a/README.md +++ b/README.md @@ -20,21 +20,6 @@ If you're looking to participate by contributing reversal reports (i.e. marketpl The server starts on port `80` by default (configurable via `HTTP_PORT`). -### Seeding local data - -Production ingests live data from contributing marketplaces. For local development, a deterministic synthetic dataset (~6 months, ~9.8k rows with realistic daily variance) can be loaded with: - -```bash -go run ./cmd/seed -``` - -The seed: - -- Refuses to run unless `Environment` is `development`. -- Uses `INSERT … ON CONFLICT (id) DO NOTHING`, so it's safe to re-run. 
-- Generates a deterministic 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y). -- Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake. - ## Configuration Configuration is loaded from environment variables or a `config.json` file. @@ -62,4 +47,19 @@ API keys are scoped to an entity and carry a permission bitfield. Keys are prefi ## Rate Limiting -Rate limits are enforced in-memory per process. Throttled responses return `429 Too Many Requests` with `X-RateLimit-*` and `Retry-After` headers. \ No newline at end of file +Rate limits are enforced in-memory per process. Throttled responses return `429 Too Many Requests` with `X-RateLimit-*` and `Retry-After` headers. + +## Seeding local data + +Production ingests live data from contributing marketplaces. For local development, a synthetic dataset (~6 months, ~2k rows with realistic daily variance) can be loaded with: + +```bash +go run ./cmd/seed +``` + +The seed: + +- Must be run in a development environment. +- Generates a 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y). +- Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake. +- Re-running the seed inserts additional rows rather than being a no-op, since IDs are derived from wall-clock time. 
\ No newline at end of file diff --git a/internal/devseed/synthetic.go b/internal/devseed/synthetic.go index 8b93957..d70b99d 100644 --- a/internal/devseed/synthetic.go +++ b/internal/devseed/synthetic.go @@ -17,7 +17,7 @@ import ( const ( syntheticRNGSeed int64 = 42 syntheticDays = 180 - syntheticTargetTotal = 9800 + syntheticTargetTotal int = 2000 syntheticBaseSteamID uint64 = 76561198000000000 syntheticBaseReporter uint = 2_900_000 ) @@ -32,12 +32,12 @@ var syntheticMarketplaces = []struct { {"swap.gg", 0.05}, } -// GenerateSynthetic returns a deterministic ~6-month dataset (~9,800 rows, -// at least one per day, gentle sinusoid with occasional spikes / quiet -// days). Snowflake IDs are unique within the slice and won't collide with -// real CSV-seeded IDs, so callers can pipe the result straight into -// InsertReversals. +// GenerateSynthetic returns a ~6-month dataset (~2,000 rows, at least one +// per day, gentle sinusoid with occasional spikes / quiet days). Snowflake +// IDs are unique within the slice and won't collide with real CSV-seeded +// IDs, so callers can pipe the result straight into InsertReversals. func GenerateSynthetic(now time.Time) []*models.Reversal { + now = now.UTC() rng := rand.New(rand.NewSource(syntheticRNGSeed)) nowMs := uint64(now.UnixMilli()) today := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) @@ -157,8 +157,10 @@ func pickMarketplace(rng *rand.Rand) string { // uses ~11k parameters. const insertChunkSize = 1000 -// InsertReversals bulk-inserts reversals with ON CONFLICT (id) DO NOTHING, -// so the seed is idempotent. Returns the number of rows actually inserted. +// InsertReversals bulk-inserts reversals with ON CONFLICT (id) DO NOTHING. +// Snowflake IDs are derived from wall-clock time, so re-running the seed +// produces new IDs and inserts additional rows rather than being a no-op. +// Returns the number of rows actually inserted. 
func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) { if len(reversals) == 0 { return 0, nil From fb6920a8c80d6395cd976a9f1182b51b73233318 Mon Sep 17 00:00:00 2001 From: zukwiz Date: Wed, 24 Jun 2026 15:10:03 +0200 Subject: [PATCH 3/3] devseed: make seed reruns idempotent and guard snowflake epoch Re-running the seed on a different day produces new snowflake IDs but the same deterministic (steam_id, marketplace_slug) pairs, which collide with the partial unique index idx_reversals_steam_id_marketplace_slug. Target that natural key (with the WHERE deleted_at IS NULL predicate) in the ON CONFLICT DO NOTHING clause so reruns skip existing rows instead of raising a unique-constraint error, and correct the README/doc comments to describe the idempotent behavior. Also clamp the synthetic createdAt to models.Epoch before the unsigned snowflake timestamp subtraction so an out-of-range clock can't underflow into a garbage ID. --- README.md | 2 +- cmd/seed/main.go | 3 +- internal/devseed/synthetic.go | 27 ++++++++++--- internal/devseed/synthetic_test.go | 65 ++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 internal/devseed/synthetic_test.go diff --git a/README.md b/README.md index 5a4ceed..1994586 100644 --- a/README.md +++ b/README.md @@ -62,4 +62,4 @@ The seed: - Must be run in a development environment. - Generates a 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y). - Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake. -- Re-running the seed inserts additional rows rather than being a no-op, since IDs are derived from wall-clock time. \ No newline at end of file +- Is safe to re-run: rows are de-duplicated on `(steam_id, marketplace_slug)` via `ON CONFLICT DO NOTHING`, so a rerun skips rows that already exist instead of raising a unique-constraint error. 
\ No newline at end of file diff --git a/cmd/seed/main.go b/cmd/seed/main.go index e683514..1921e77 100644 --- a/cmd/seed/main.go +++ b/cmd/seed/main.go @@ -5,7 +5,8 @@ // // go run ./cmd/seed // -// The insert uses ON CONFLICT (id) DO NOTHING, so re-running is safe. +// The insert uses ON CONFLICT (steam_id, marketplace_slug) DO NOTHING, so +// re-running is safe: rows that already exist are skipped rather than erroring. package main import ( diff --git a/internal/devseed/synthetic.go b/internal/devseed/synthetic.go index d70b99d..6cdc5ff 100644 --- a/internal/devseed/synthetic.go +++ b/internal/devseed/synthetic.go @@ -115,7 +115,15 @@ func GenerateSynthetic(now time.Time) []*models.Reversal { // mirrors domain/models/snowflake.go so generated IDs sort // chronologically alongside production rows. seq = (seq + 1) & 0x0FFF - sfTs := uint64(createdAt.UnixMilli()) - models.Epoch + // createdAt is always well after models.Epoch for the ~6-month + // synthetic window, but clamp defensively so an out-of-range + // createdAt can't underflow the unsigned subtraction into a + // garbage snowflake (mirrors the guard in genSnowflakeWithParts). + createdAtMs := uint64(createdAt.UnixMilli()) + if createdAtMs < models.Epoch { + createdAtMs = models.Epoch + } + sfTs := createdAtMs - models.Epoch sf := models.Snowflake((sfTs << 22) | uint64(seq)) reporter := syntheticBaseReporter + uint(steamOffset) @@ -157,21 +165,30 @@ func pickMarketplace(rng *rand.Rand) string { // uses ~11k parameters. const insertChunkSize = 1000 -// InsertReversals bulk-inserts reversals with ON CONFLICT (id) DO NOTHING. -// Snowflake IDs are derived from wall-clock time, so re-running the seed -// produces new IDs and inserts additional rows rather than being a no-op. +// InsertReversals bulk-inserts reversals, skipping any row whose +// (steam_id, marketplace_slug) already exists via ON CONFLICT DO NOTHING. 
+// That pair is deterministic across runs (it does not depend on wall-clock +// time), so re-running the seed is safe and idempotent: already-present rows +// are skipped instead of raising a unique-constraint error. The conflict +// target matches the partial unique index created in repository/public +// (idx_reversals_steam_id_marketplace_slug ... WHERE deleted_at IS NULL). // Returns the number of rows actually inserted. func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) { if len(reversals) == 0 { return 0, nil } + onConflict := clause.OnConflict{ + Columns: []clause.Column{{Name: "steam_id"}, {Name: "marketplace_slug"}}, + TargetWhere: clause.Where{Exprs: []clause.Expression{clause.Expr{SQL: "deleted_at IS NULL"}}}, + DoNothing: true, + } var inserted int64 for i := 0; i < len(reversals); i += insertChunkSize { end := i + insertChunkSize if end > len(reversals) { end = len(reversals) } - res := db.Clauses(clause.OnConflict{DoNothing: true}).Create(reversals[i:end]) + res := db.Clauses(onConflict).Create(reversals[i:end]) if res.Error != nil { return inserted, res.Error } diff --git a/internal/devseed/synthetic_test.go b/internal/devseed/synthetic_test.go new file mode 100644 index 0000000..5eecc02 --- /dev/null +++ b/internal/devseed/synthetic_test.go @@ -0,0 +1,65 @@ +package devseed + +import ( + "testing" + "time" + + "reverse-watch/domain/models" + "reverse-watch/internal/testutil" + "reverse-watch/repository/public" +) + +// TestInsertReversals_RerunIsSafe guards seed-rerun idempotency: re-running the seed on a +// different calendar day produces new snowflake IDs but the same deterministic +// (steam_id, marketplace_slug) pairs, which collide with the partial unique +// index. The insert must skip those rows via ON CONFLICT DO NOTHING rather +// than raising a unique-constraint error. 
+func TestInsertReversals_RerunIsSafe(t *testing.T) { + db := testutil.NewTestDB(t) + if err := public.CreateIndexes(db); err != nil { + t.Fatalf("CreateIndexes: %v", err) + } + + // Both runs use past timestamps (BeforeCreate rejects future + // reversed_at), but on different calendar days so snowflake IDs differ. + now := time.Now().UTC().AddDate(0, 0, -10) + first := GenerateSynthetic(now) + if _, err := InsertReversals(db, first); err != nil { + t.Fatalf("first insert: %v", err) + } + + later := now.AddDate(0, 0, 5) + second := GenerateSynthetic(later) + n2, err := InsertReversals(db, second) + if err != nil { + t.Fatalf("rerun errored, expected ON CONFLICT DO NOTHING to skip: %v", err) + } + if n2 != 0 { + t.Errorf("rerun inserted %d rows, want 0 (idempotent)", n2) + } + + if first[0].ID == second[0].ID { + t.Errorf("expected differing snowflake IDs across days, got %d for both", first[0].ID) + } +} + +// TestGenerateSynthetic_NoEpochUnderflow guards against snowflake epoch underflow: generating data +// with a clock that predates models.Epoch must not underflow the unsigned +// snowflake timestamp subtraction. Every decoded timestamp must equal Epoch exactly (the clamped value). +func TestGenerateSynthetic_NoEpochUnderflow(t *testing.T) { + // A clock one day before Epoch makes every generated createdAt precede + // Epoch, so the guard must clamp each snowflake timestamp to exactly + // Epoch. Without the guard the unsigned subtraction wraps to a huge + // value, decoding to a timestamp far beyond Epoch. + beforeEpoch := time.UnixMilli(int64(models.Epoch)).UTC().AddDate(0, 0, -1) + rows := GenerateSynthetic(beforeEpoch) + if len(rows) == 0 { + t.Fatal("expected synthetic rows") + } + for _, r := range rows { + ts := models.ParseSnowflake(r.ID).Timestamp + if ts != models.Epoch { + t.Fatalf("snowflake timestamp %d != Epoch %d (underflow not guarded)", ts, models.Epoch) + } + } +}