package main import ( "bufio" "bytes" "context" "encoding/json" "fmt" "io" "log" "net/http" "net/url" "os" "strconv" "strings" "time" "github.com/minio/minio-go/v7" "github.com/minio/minio-go/v7/pkg/credentials" ) type Config struct { CouchURL string CouchDB string S3Endpoint string S3Bucket string S3UseSSL bool } func getEnv(key, def string) string { if v := os.Getenv(key); v != "" { return v } return def } func getEnvBool(key string, def bool) bool { if v := os.Getenv(key); v != "" { b, _ := strconv.ParseBool(v) return b } return def } type S3Client struct { client *minio.Client bucket string } func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) { // anonymous credentials → public bucket c, err := minio.New(endpoint, &minio.Options{ Creds: credentials.NewStaticV4("", "", ""), Secure: useSSL, }) if err != nil { return nil, fmt.Errorf("minio client creation failed: %w", err) } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() exists, err := c.BucketExists(ctx, bucket) if err != nil { return nil, fmt.Errorf("bucket check failed: %w", err) } if !exists { return nil, fmt.Errorf("bucket %s does not exist", bucket) } return &S3Client{client: c, bucket: bucket}, nil } // UploadFromURL downloads a remote image, puts it in the bucket and returns the public URL func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) { log.Printf("Downloading image: %s", imgURL) resp, err := http.Get(imgURL) if err != nil { return "", fmt.Errorf("HTTP GET failed: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return "", fmt.Errorf("image download failed with status %d", resp.StatusCode) } data, err := io.ReadAll(resp.Body) if err != nil { return "", fmt.Errorf("reading response body failed: %w", err) } ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() _, err = s.client.PutObject(ctx, s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{ ContentType: "image/webp", }) if err != nil { return "", fmt.Errorf("S3 upload failed: %w", err) } publicURL := fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key) log.Printf("Successfully uploaded image: %s", publicURL) return publicURL, nil } type House struct { ID string `json:"_id"` Rev string `json:"_rev,omitempty"` Source string `json:"source"` URL string `json:"url"` Status int `json:"status"` Type int `json:"type"` SubType int `json:"subType"` Images []string `json:"images"` Raw map[string]json.RawMessage `json:"raw,omitempty"` ScrapedAt time.Time `json:"scraped_at"` } type CouchClient struct { baseURL string database string client *http.Client } func NewCouchClient(base, db string) *CouchClient { return &CouchClient{ baseURL: base, database: db, client: &http.Client{Timeout: 30 * time.Second}, } } func (c *CouchClient) Upsert(h *House) error { body, err := json.Marshal(h) if err != nil { return fmt.Errorf("JSON marshal failed: %w", err) } reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, url.PathEscape(h.ID)) req, err := http.NewRequest("PUT", reqURL, bytes.NewReader(body)) if err != nil { return fmt.Errorf("request creation failed: %w", err) } req.Header.Set("Content-Type", "application/json") req.Header.Set("Accept", "application/json") resp, err := c.client.Do(req) if err != nil { return fmt.Errorf("HTTP request failed: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) return fmt.Errorf("couchDB responded with status %d: %s", resp.StatusCode, string(body)) } var rev struct { Rev string `json:"rev"` } if err := json.NewDecoder(resp.Body).Decode(&rev); err != nil { return fmt.Errorf("decoding response failed: %w", err) } h.Rev = rev.Rev return nil } type OikotieScraper struct { client *http.Client s3 *S3Client baseURL string otaToken string otaCuid string otaLoaded string phpSessID string rateLimiter <-chan time.Time } func NewOikotieScraper(s3 *S3Client) *OikotieScraper { return &OikotieScraper{ client: &http.Client{Timeout: 30 * time.Second}, s3: s3, baseURL: "https://asunnot.oikotie.fi/api/search", rateLimiter: time.Tick(3 * time.Second), } } func (osi *OikotieScraper) loadTokens() error { osi.otaToken = getEnv("OTA_TOKEN", "") osi.otaCuid = getEnv("OTA_CUID", "") osi.otaLoaded = getEnv("OTA_LOADED", "") osi.phpSessID = getEnv("PHPSESSID", "") if osi.otaToken == "" || osi.otaCuid == "" || osi.otaLoaded == "" || osi.phpSessID == "" { log.Println("Missing one or more tokens – please enter them now:") r := bufio.NewReader(os.Stdin) if osi.otaToken == "" { fmt.Print("OTA-token: ") token, err := r.ReadString('\n') if err != nil { return fmt.Errorf("reading OTA-token failed: %w", err) } osi.otaToken = strings.TrimSpace(token) } if osi.otaCuid == "" { fmt.Print("OTA-cuid: ") cuid, err := r.ReadString('\n') if err != nil { return fmt.Errorf("reading OTA-cuid failed: %w", err) } osi.otaCuid = strings.TrimSpace(cuid) } if osi.otaLoaded == "" { fmt.Print("OTA-loaded: ") loaded, err := r.ReadString('\n') if err != nil { return fmt.Errorf("reading OTA-loaded failed: %w", err) } osi.otaLoaded = strings.TrimSpace(loaded) } if osi.phpSessID == "" { fmt.Print("PHPSESSID: ") sessID, err := r.ReadString('\n') if err != nil { return fmt.Errorf("reading PHPSESSID failed: %w", err) } osi.phpSessID = strings.TrimSpace(sessID) } } return nil } func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error { if err := os.loadTokens(); err != nil { return fmt.Errorf("loading tokens failed: %w", err) } limit := 24 offset := 0 totalSaved := 0 retryCount := 0 maxRetries := 3 for { select { case <-ctx.Done(): log.Println("Context cancelled, stopping scrape") return ctx.Err() case <-os.rateLimiter: log.Printf("Fetching page with offset %d, limit %d", offset, limit) cards, found, err := os.fetchPage(offset, limit) if err != nil { if strings.Contains(err.Error(), "401") { log.Println("401 Unauthorized – re-entering tokens") if err := os.loadTokens(); err != nil { return fmt.Errorf("reloading tokens failed: %w", err) } retryCount++ if retryCount > maxRetries { return fmt.Errorf("max retries reached for authentication") } continue } log.Printf("Fetch error (offset %d): %v", offset, err) retryCount++ if retryCount > maxRetries { return fmt.Errorf("max retries reached for fetching") } time.Sleep(5 * time.Second) continue } retryCount = 0 // Reset retry count on successful fetch if len(cards) == 0 { log.Printf("No cards found at offset %d – finished scraping", offset) return nil } savedInBatch := 0 for i, c := range cards { log.Printf("Processing card %d/%d: %s", i+1, len(cards), c.ID) h, err := os.convertCard(c) if err != nil { log.Printf("Convert error for card %s: %v", c.ID, err) continue } if err := couch.Upsert(h); err != nil { log.Printf("CouchDB upsert failed for %s: %v", h.ID, err) } else { totalSaved++ savedInBatch++ } } log.Printf("Batch %d-%d: %d/%d cards saved (total: %d, found: %d)", offset, offset+len(cards)-1, savedInBatch, len(cards), totalSaved, found) if offset+len(cards) >= found { log.Printf("Reached end of results – %d cards saved in total", totalSaved) return nil } offset += limit } } } // Fixed: cardId can be number or string, so use json.Number type apiCard struct { ID json.Number `json:"cardId"` Type int `json:"cardType"` SubType int `json:"cardSubType"` URL string `json:"url"` Status int `json:"status"` Data json.RawMessage `json:"data"` Location json.RawMessage `json:"location"` Company json.RawMessage `json:"company"` Medias []struct { ImageMobileWebPx2 string `json:"imageMobileWebPx2"` } `json:"medias"` } func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) { q := url.Values{} q.Add("locations", `[[64,6,"Helsinki"]]`) for _, bt := range []string{"4", "8", "32", "128", "64", "512"} { q.Add("buildingType[]", bt) } q.Add("cardType", "100") q.Add("limit", strconv.Itoa(limit)) q.Add("offset", strconv.Itoa(offset)) q.Add("sortBy", "published_sort_desc") reqURL := os.baseURL + "?" + q.Encode() req, err := http.NewRequest("GET", reqURL, nil) if err != nil { return nil, 0, fmt.Errorf("creating request failed: %w", err) } req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0") req.Header.Set("Accept", "application/json, text/plain, */*") req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512") req.Header.Set("OTA-token", os.otaToken) req.Header.Set("OTA-cuid", os.otaCuid) req.Header.Set("OTA-loaded", os.otaLoaded) req.Header.Set("Cookie", fmt.Sprintf("PHPSESSID=%s; user_id=%s; cardType=100", os.phpSessID, os.otaCuid)) resp, err := os.client.Do(req) if err != nil { return nil, 0, fmt.Errorf("HTTP request failed: %w", err) } defer resp.Body.Close() if resp.StatusCode == 401 { return nil, 0, fmt.Errorf("401 Unauthorized") } if resp.StatusCode != 200 { body, _ := io.ReadAll(resp.Body) return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(body)) } var payload struct { Found int `json:"found"` Cards []apiCard `json:"cards"` } // Read the body first for better error reporting body, err := io.ReadAll(resp.Body) if err != nil { return nil, 0, fmt.Errorf("reading response body failed: %w", err) } if err := json.Unmarshal(body, &payload); err != nil { log.Printf("Raw response: %s", string(body)) return nil, 0, fmt.Errorf("JSON unmarshal failed: %w", err) } return payload.Cards, payload.Found, nil } func (os *OikotieScraper) convertCard(c apiCard) (*House, error) { // Convert json.Number to string for the ID cardID := c.ID.String() h := &House{ ID: "oikotie_" + cardID, Source: "oikotie", URL: c.URL, Status: c.Status, Type: c.Type, SubType: c.SubType, ScrapedAt: time.Now().UTC(), Raw: map[string]json.RawMessage{ "data": c.Data, "location": c.Location, "company": c.Company, }, } // Process images for i, m := range c.Medias { if m.ImageMobileWebPx2 == "" { continue } key := fmt.Sprintf("%s/img_%d.webp", h.ID, i) publicURL, err := os.s3.UploadFromURL(m.ImageMobileWebPx2, key) if err != nil { log.Printf("Image upload failed for %s: %v", key, err) continue } h.Images = append(h.Images, publicURL) } log.Printf("Successfully converted card %s with %d images", cardID, len(h.Images)) return h, nil } func main() { log.SetFlags(log.LstdFlags | log.Lshortfile) log.Println("Starting Oikotie scraper...") cfg := Config{ CouchURL: getEnv("COUCHDB_URL", "https://couch.tammi.cc"), CouchDB: getEnv("COUCHDB_DATABASE", "asunnot"), S3Endpoint: getEnv("S3_ENDPOINT", "s3.tammi.cc"), S3Bucket: getEnv("S3_BUCKET", "asunnot"), S3UseSSL: getEnvBool("S3_USE_SSL", true), } log.Printf("Configuration: CouchDB=%s, S3=%s/%s", cfg.CouchURL, cfg.S3Endpoint, cfg.S3Bucket) s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL) if err != nil { log.Fatalf("S3 initialization failed: %v", err) } log.Println("S3 client initialized successfully") couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB) log.Println("CouchDB client initialized successfully") scraper := NewOikotieScraper(s3) log.Println("Oikotie scraper initialized successfully") ctx, cancel := context.WithCancel(context.Background()) defer cancel() // Handle graceful shutdown go func() { <-ctx.Done() log.Println("Shutting down...") }() log.Println("Starting full Oikotie scrape...") if err := scraper.ScrapeAll(ctx, couch); err != nil { log.Fatalf("Scrape failed: %v", err) } log.Println("Scraping completed successfully!") }