// Command main scrapes Helsinki listings from the Oikotie search API, copies
// each listing's images into an S3/MinIO bucket and stores one document per
// listing in CouchDB.
package main

import (
	"bufio"
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/minio/minio-go/v7"
	"github.com/minio/minio-go/v7/pkg/credentials"
)

// ---------------------------------------------------------------------
// 1. CONFIG & HELPERS
// ---------------------------------------------------------------------

const (
	// pageSize is the number of cards requested per search call.
	pageSize = 24
	// defaultRequestInterval is the minimum spacing between search calls.
	defaultRequestInterval = 3 * time.Second
	// retryDelay is the pause after a failed search call.
	retryDelay = 5 * time.Second
	// maxFetchFailures bounds consecutive failed search calls before
	// ScrapeAll gives up instead of retrying forever.
	maxFetchFailures = 5
	// imageTimeout bounds one image download plus its upload.
	imageTimeout = 60 * time.Second
	// maxErrorBody caps how much of an error response is read for messages.
	maxErrorBody = 4096
)

var (
	// errUnauthorized is returned by fetchPage when the API answers 401.
	errUnauthorized = errors.New("401 Unauthorized")
	// errConflict is returned by CouchClient.put when CouchDB answers 409.
	errConflict = errors.New("couch 409: document update conflict")
)

// Config holds the connection settings read from the environment in main.
type Config struct {
	CouchURL   string
	CouchDB    string
	S3Endpoint string
	S3Bucket   string
	S3UseSSL   bool
}

// getEnv returns the value of the environment variable key, or def when the
// variable is unset or empty.
func getEnv(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

// getEnvBool returns the boolean value of the environment variable key. It
// returns def when the variable is unset, empty, or not parsable by
// strconv.ParseBool (a warning is logged in the last case).
func getEnvBool(key string, def bool) bool {
	v := os.Getenv(key)
	if v == "" {
		return def
	}
	b, err := strconv.ParseBool(v)
	if err != nil {
		log.Printf("invalid boolean %q for %s, using default %t", v, key, def)
		return def
	}
	return b
}

// sleepCtx waits for d or until ctx is done, whichever comes first. It
// returns ctx.Err() when the context ended the wait.
func sleepCtx(ctx context.Context, d time.Duration) error {
	t := time.NewTimer(d)
	defer t.Stop()
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-t.C:
		return nil
	}
}

// ---------------------------------------------------------------------
// 2. S3 / MINIO CLIENT (public bucket – no keys)
// ---------------------------------------------------------------------

// S3Client uploads objects into a single bucket.
type S3Client struct {
	client *minio.Client
	bucket string
}

// NewS3Client connects to endpoint with empty (anonymous) credentials and
// verifies that bucket exists. It returns an error when the client cannot be
// created, the existence check fails, or the bucket does not exist.
func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) {
	// Empty static credentials: the bucket is expected to be public.
	c, err := minio.New(endpoint, &minio.Options{
		Creds:  credentials.NewStaticV4("", "", ""),
		Secure: useSSL,
	})
	if err != nil {
		return nil, fmt.Errorf("creating S3 client for %s: %w", endpoint, err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	exists, err := c.BucketExists(ctx, bucket)
	if err != nil {
		// Keep the cause; the original message alone hid network/auth errors.
		return nil, fmt.Errorf("bucket %s not accessible: %w", bucket, err)
	}
	if !exists {
		return nil, fmt.Errorf("bucket %s not accessible", bucket)
	}
	return &S3Client{client: c, bucket: bucket}, nil
}

// UploadFromURL downloads imgURL, stores the bytes in the bucket under key
// with content type image/webp, and returns the object's URL built from the
// client's endpoint scheme and host. The download and upload together are
// bounded by imageTimeout.
func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), imageTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, imgURL, nil)
	if err != nil {
		return "", fmt.Errorf("building image request: %w", err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("img status %d", resp.StatusCode)
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading image body: %w", err)
	}

	_, err = s.client.PutObject(ctx, s.bucket, key, bytes.NewReader(data), int64(len(data)),
		minio.PutObjectOptions{ContentType: "image/webp"})
	if err != nil {
		return "", fmt.Errorf("uploading %s: %w", key, err)
	}

	// Use the endpoint's own scheme so non-TLS endpoints get a working URL.
	endpoint := s.client.EndpointURL()
	return fmt.Sprintf("%s://%s/%s/%s", endpoint.Scheme, endpoint.Host, s.bucket, key), nil
}

// ---------------------------------------------------------------------
// 3. HOUSE MODEL
// ---------------------------------------------------------------------

// House is the document stored in CouchDB for one listing.
type House struct {
	ID        string                     `json:"_id"`
	Rev       string                     `json:"_rev,omitempty"`
	Source    string                     `json:"source"`
	URL       string                     `json:"url"`
	Status    int                        `json:"status"`
	Type      int                        `json:"type"`
	SubType   int                        `json:"subType"`
	Images    []string                   `json:"images"`
	Raw       map[string]json.RawMessage `json:"raw,omitempty"`
	ScrapedAt time.Time                  `json:"scraped_at"`
}

// ---------------------------------------------------------------------
// 4. COUCHDB CLIENT (simplified – only Upsert)
// ---------------------------------------------------------------------

// CouchClient talks to one CouchDB database over HTTP.
type CouchClient struct {
	baseURL  string
	database string
	client   *http.Client
}

// NewCouchClient returns a client for database db on the server at base,
// using an HTTP client with a 30-second timeout.
func NewCouchClient(base, db string) *CouchClient {
	return &CouchClient{
		baseURL:  base,
		database: db,
		client:   &http.Client{Timeout: 30 * time.Second},
	}
}

// Upsert writes h to CouchDB under h.ID and stores the resulting revision in
// h.Rev. When CouchDB reports a conflict (the document already exists with a
// different revision), the current revision is looked up and the write is
// retried once with it.
func (c *CouchClient) Upsert(h *House) error {
	err := c.put(h)
	if !errors.Is(err, errConflict) {
		return err
	}

	rev, err := c.currentRev(h.ID)
	if err != nil {
		return fmt.Errorf("resolving conflict for %s: %w", h.ID, err)
	}
	h.Rev = rev
	return c.put(h)
}

// docURL returns the URL of the document with the given id.
func (c *CouchClient) docURL(id string) string {
	return fmt.Sprintf("%s/%s/%s",
		strings.TrimRight(c.baseURL, "/"), url.PathEscape(c.database), url.PathEscape(id))
}

// put performs a single PUT of h and records the returned revision in h.Rev.
// It returns errConflict on HTTP 409.
func (c *CouchClient) put(h *House) error {
	body, err := json.Marshal(h)
	if err != nil {
		return fmt.Errorf("encoding %s: %w", h.ID, err)
	}

	req, err := http.NewRequest(http.MethodPut, c.docURL(h.ID), bytes.NewReader(body))
	if err != nil {
		return fmt.Errorf("building couch request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")

	resp, err := c.client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK, http.StatusCreated, http.StatusAccepted:
		// Success; the revision is decoded below.
	case http.StatusConflict:
		return errConflict
	default:
		// Best effort: the body only decorates the error message.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return fmt.Errorf("couch %d: %s", resp.StatusCode, b)
	}

	var out struct {
		Rev string `json:"rev"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return fmt.Errorf("decoding couch response for %s: %w", h.ID, err)
	}
	h.Rev = out.Rev
	return nil
}

// currentRev returns the revision of the stored document id, taken from the
// ETag header of a HEAD request.
func (c *CouchClient) currentRev(id string) (string, error) {
	req, err := http.NewRequest(http.MethodHead, c.docURL(id), nil)
	if err != nil {
		return "", fmt.Errorf("building couch request: %w", err)
	}

	resp, err := c.client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("couch HEAD status %d", resp.StatusCode)
	}

	// The ETag is the revision in double quotes; tolerate a weak-validator
	// prefix in case an intermediary rewrote it.
	rev := strings.Trim(strings.TrimPrefix(resp.Header.Get("ETag"), "W/"), `"`)
	if rev == "" {
		return "", errors.New("couch HEAD returned no ETag")
	}
	return rev, nil
}

// ---------------------------------------------------------------------
// 5. OIKOTIE SCRAPER (single struct – everything in main.go)
// ---------------------------------------------------------------------

// OikotieScraper pages through the search API and converts cards to Houses.
type OikotieScraper struct {
	client   *http.Client
	s3       *S3Client
	baseURL  string
	interval time.Duration // spacing between search calls; <= 0 means default
	in       *bufio.Reader // token prompt input; nil means os.Stdin

	otaToken  string
	otaCuid   string
	otaLoaded string
	phpSessID string
}

// NewOikotieScraper returns a scraper that uploads images through s3 and
// issues at most one search request every three seconds.
func NewOikotieScraper(s3 *S3Client) *OikotieScraper {
	return &OikotieScraper{
		client:   &http.Client{Timeout: 30 * time.Second},
		s3:       s3,
		baseURL:  "https://asunnot.oikotie.fi/api/search",
		interval: defaultRequestInterval,
		in:       bufio.NewReader(os.Stdin),
	}
}

// ---- token handling -------------------------------------------------

// loadTokens reads the four session tokens from the environment (OTA_TOKEN,
// OTA_CUID, OTA_LOADED, PHPSESSID) and prompts on stdin for any that are
// missing. A failure to read the prompt input is logged, not returned.
func (s *OikotieScraper) loadTokens() {
	s.otaToken = getEnv("OTA_TOKEN", "")
	s.otaCuid = getEnv("OTA_CUID", "")
	s.otaLoaded = getEnv("OTA_LOADED", "")
	s.phpSessID = getEnv("PHPSESSID", "")

	if err := s.promptTokens(false); err != nil {
		log.Printf("reading tokens: %v", err)
	}
}

// promptTokens asks on stdin for session tokens. With force false only empty
// tokens are requested; with force true all four are requested again. It
// returns an error when input ends before a requested token was entered.
func (s *OikotieScraper) promptTokens(force bool) error {
	fields := []struct {
		label string
		value *string
	}{
		{"OTA-token", &s.otaToken},
		{"OTA-cuid", &s.otaCuid},
		{"OTA-loaded", &s.otaLoaded},
		{"PHPSESSID", &s.phpSessID},
	}

	// One shared reader, so input buffered during one prompt is not lost
	// to the next.
	if s.in == nil {
		s.in = bufio.NewReader(os.Stdin)
	}

	announced := false
	for _, f := range fields {
		if !force && *f.value != "" {
			continue
		}
		if !force && !announced {
			log.Println("Missing one or more tokens – please enter them now:")
			announced = true
		}

		fmt.Printf("%s: ", f.label)
		line, err := s.in.ReadString('\n')
		line = strings.TrimSpace(line)
		// A final line without a trailing newline still counts as input.
		if err != nil && line == "" {
			return fmt.Errorf("reading %s: %w", f.label, err)
		}
		*f.value = line
	}
	return nil
}

// ---- main scrape loop -----------------------------------------------

// ScrapeAll pages through the search results, converts every card and
// upserts it into couch. It returns nil once a page is empty or the reported
// total has been reached. It returns ctx.Err() when ctx is done, and an error
// when tokens cannot be refreshed after a 401 or when maxFetchFailures
// consecutive page fetches fail. Per-card conversion and upsert failures are
// logged and skipped.
func (s *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error {
	s.loadTokens()

	interval := s.interval
	if interval <= 0 {
		interval = defaultRequestInterval
	}
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	offset, totalSaved, failures := 0, 0, 0
	for {
		// Rate limit: wait for the next tick (also before the first page).
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}

		cards, found, err := s.fetchPage(offset, pageSize)
		switch {
		case errors.Is(err, errUnauthorized):
			log.Println("401 – re-entering tokens")
			// Re-reading the environment would return the same rejected
			// values, so ask for all tokens again.
			if err := s.promptTokens(true); err != nil {
				return fmt.Errorf("refreshing tokens: %w", err)
			}
			continue
		case err != nil:
			failures++
			log.Printf("fetch error (offset %d): %v", offset, err)
			if failures >= maxFetchFailures {
				return fmt.Errorf("fetching offset %d: giving up after %d consecutive failures: %w",
					offset, failures, err)
			}
			if err := sleepCtx(ctx, retryDelay); err != nil {
				return err
			}
			continue
		}
		failures = 0

		if len(cards) == 0 {
			log.Printf("No cards at offset %d – finished", offset)
			return nil
		}

		pageSaved := 0
		for _, c := range cards {
			if err := ctx.Err(); err != nil {
				return err
			}
			h, err := s.convertCard(c)
			if err != nil {
				log.Printf("convert error %s: %v", c.ID, err)
				continue
			}
			if err := couch.Upsert(h); err != nil {
				log.Printf("couch upsert %s: %v", h.ID, err)
				continue
			}
			pageSaved++
		}
		totalSaved += pageSaved

		log.Printf("offset %d-%d → %d new (total %d/%d)",
			offset, offset+len(cards)-1, pageSaved, totalSaved, found)

		if offset+len(cards) >= found {
			log.Printf("Reached end – %d cards saved", totalSaved)
			return nil
		}
		// Advance by what was actually returned, matching the end check.
		offset += len(cards)
	}
}

// ---- API call --------------------------------------------------------

// apiCard is one entry of the "cards" array in the search response.
//
// NOTE(review): ID is decoded as a JSON string; if the API sends cardId as a
// number the page decode will fail. Confirm against a live response.
type apiCard struct {
	ID       string          `json:"cardId"`
	Type     int             `json:"cardType"`
	SubType  int             `json:"cardSubType"`
	URL      string          `json:"url"`
	Status   int             `json:"status"`
	Data     json.RawMessage `json:"data"`
	Location json.RawMessage `json:"location"`
	Company  json.RawMessage `json:"company"`
	Medias   []struct {
		ImageMobileWebPx2 string `json:"imageMobileWebPx2"`
	} `json:"medias"`
}

// fetchPage requests limit cards starting at offset and returns the cards
// together with the total count reported by the API. It returns
// errUnauthorized on HTTP 401 and a descriptive error on any other non-200
// status.
func (s *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) {
	q := url.Values{}
	q.Add("locations", `[[64,6,"Helsinki"]]`)
	for _, bt := range []string{"4", "8", "32", "128", "64", "512"} {
		q.Add("buildingType[]", bt)
	}
	q.Add("cardType", "100")
	q.Add("limit", strconv.Itoa(limit))
	q.Add("offset", strconv.Itoa(offset))
	q.Add("sortBy", "published_sort_desc")

	req, err := http.NewRequest(http.MethodGet, s.baseURL+"?"+q.Encode(), nil)
	if err != nil {
		return nil, 0, fmt.Errorf("building search request: %w", err)
	}

	// ---- headers ----------------------------------------------------
	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0")
	req.Header.Set("Accept", "application/json, text/plain, */*")
	req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512")
	req.Header.Set("OTA-token", s.otaToken)
	req.Header.Set("OTA-cuid", s.otaCuid)
	req.Header.Set("OTA-loaded", s.otaLoaded)
	req.Header.Set("Cookie", fmt.Sprintf("PHPSESSID=%s; user_id=%s; cardType=100", s.phpSessID, s.otaCuid))

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, 0, err
	}
	defer resp.Body.Close()

	switch resp.StatusCode {
	case http.StatusOK:
		// Decoded below.
	case http.StatusUnauthorized:
		return nil, 0, errUnauthorized
	default:
		// Best effort: the body only decorates the error message.
		b, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, b)
	}

	var payload struct {
		Found int       `json:"found"`
		Cards []apiCard `json:"cards"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return nil, 0, fmt.Errorf("decoding search response: %w", err)
	}
	return payload.Cards, payload.Found, nil
}

// ---- conversion -------------------------------------------------------

// convertCard builds a House from c and uploads each of the card's non-empty
// imageMobileWebPx2 URLs through the S3 client. Images that fail to upload
// are logged and left out; the returned error is always nil.
func (s *OikotieScraper) convertCard(c apiCard) (*House, error) {
	h := &House{
		ID:        "oikotie_" + c.ID,
		Source:    "oikotie",
		URL:       "https://asunnot.oikotie.fi" + c.URL,
		Status:    c.Status,
		Type:      c.Type,
		SubType:   c.SubType,
		ScrapedAt: time.Now(),
		// Non-nil so a listing without images is stored as [] rather
		// than null.
		Images: make([]string, 0, len(c.Medias)),
		Raw: map[string]json.RawMessage{
			"data":     c.Data,
			"location": c.Location,
			"company":  c.Company,
		},
	}

	// ---- images → download → S3 → store public URL --------------------
	for i, m := range c.Medias {
		if m.ImageMobileWebPx2 == "" {
			continue
		}
		key := fmt.Sprintf("%s/img_%d.webp", h.ID, i)
		publicURL, err := s.s3.UploadFromURL(m.ImageMobileWebPx2, key)
		if err != nil {
			log.Printf("image upload failed %s: %v", key, err)
			continue
		}
		h.Images = append(h.Images, publicURL)
	}
	return h, nil
}

// ---------------------------------------------------------------------
// 6. MAIN
// ---------------------------------------------------------------------

func main() {
	cfg := Config{
		CouchURL:   getEnv("COUCHDB_URL", "https://couch.tammi.cc"),
		CouchDB:    getEnv("COUCHDB_DATABASE", "asunnot"),
		S3Endpoint: getEnv("S3_ENDPOINT", "s3.tammi.cc"),
		S3Bucket:   getEnv("S3_BUCKET", "asunnot"),
		S3UseSSL:   getEnvBool("S3_USE_SSL", true),
	}

	s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL)
	if err != nil {
		log.Fatalf("S3 init: %v", err)
	}
	couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB)
	scraper := NewOikotieScraper(s3)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	log.Println("Starting full Oikotie scrape …")
	if err := scraper.ScrapeAll(ctx, couch); err != nil {
		log.Fatalf("scrape failed: %v", err)
	}
	log.Println("All done!")
}