From d41ac3c094f733a8038885de3400ed7558b2b878 Mon Sep 17 00:00:00 2001 From: Petri Hienonen Date: Fri, 14 Nov 2025 11:47:49 +0200 Subject: Minor tuning --- scrape/main.go | 248 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 157 insertions(+), 91 deletions(-) (limited to 'scrape/main.go') diff --git a/scrape/main.go b/scrape/main.go index 7ef5ce4..fed0397 100644 --- a/scrape/main.go +++ b/scrape/main.go @@ -19,10 +19,6 @@ import ( "github.com/minio/minio-go/v7/pkg/credentials" ) -// --------------------------------------------------------------------- -// 1. CONFIG & HELPERS -// --------------------------------------------------------------------- - type Config struct { CouchURL string CouchDB string @@ -46,10 +42,6 @@ func getEnvBool(key string, def bool) bool { return def } -// --------------------------------------------------------------------- -// 2. S3 / MINIO CLIENT (public bucket – no keys) -// --------------------------------------------------------------------- - type S3Client struct { client *minio.Client bucket string @@ -62,44 +54,53 @@ func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) { Secure: useSSL, }) if err != nil { - return nil, err + return nil, fmt.Errorf("minio client creation failed: %w", err) } ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() exists, err := c.BucketExists(ctx, bucket) - if err != nil || !exists { - return nil, fmt.Errorf("bucket %s not accessible", bucket) + if err != nil { + return nil, fmt.Errorf("bucket check failed: %w", err) + } + if !exists { + return nil, fmt.Errorf("bucket %s does not exist", bucket) } return &S3Client{client: c, bucket: bucket}, nil } // UploadFromURL downloads a remote image, puts it in the bucket and returns the public URL func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) { + log.Printf("Downloading image: %s", imgURL) resp, err := http.Get(imgURL) if err != nil { - return "", err + return "", fmt.Errorf("HTTP GET failed: %w", err) } defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { - return "", fmt.Errorf("img status %d", resp.StatusCode) + return "", fmt.Errorf("image download failed with status %d", resp.StatusCode) } + data, err := io.ReadAll(resp.Body) if err != nil { - return "", err + return "", fmt.Errorf("reading response body failed: %w", err) } - _, err = s.client.PutObject(context.Background(), s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{ + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + _, err = s.client.PutObject(ctx, s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{ ContentType: "image/webp", }) if err != nil { - return "", err + return "", fmt.Errorf("S3 upload failed: %w", err) } - return fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key), nil + + publicURL := fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key) + log.Printf("Successfully uploaded image: %s", publicURL) + return publicURL, nil } -// --------------------------------------------------------------------- -// 3. HOUSE MODEL -// --------------------------------------------------------------------- - type House struct { ID string `json:"_id"` Rev string `json:"_rev,omitempty"` @@ -113,10 +114,6 @@ type House struct { ScrapedAt time.Time `json:"scraped_at"` } -// --------------------------------------------------------------------- -// 4. COUCHDB CLIENT (simplified – only Upsert) -// --------------------------------------------------------------------- - type CouchClient struct { baseURL string database string @@ -132,33 +129,41 @@ func NewCouchClient(base, db string) *CouchClient { } func (c *CouchClient) Upsert(h *House) error { - body, _ := json.Marshal(h) - reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, h.ID) - req, _ := http.NewRequest("PUT", reqURL, bytes.NewReader(body)) + body, err := json.Marshal(h) + if err != nil { + return fmt.Errorf("JSON marshal failed: %w", err) + } + + reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, url.PathEscape(h.ID)) + req, err := http.NewRequest("PUT", reqURL, bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("request creation failed: %w", err) + } + req.Header.Set("Content-Type", "application/json") req.Header.Set("Accept", "application/json") resp, err := c.client.Do(req) if err != nil { - return err + return fmt.Errorf("HTTP request failed: %w", err) } defer resp.Body.Close() + if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { - b, _ := io.ReadAll(resp.Body) - return fmt.Errorf("couch %d: %s", resp.StatusCode, string(b)) + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("couchDB responded with status %d: %s", resp.StatusCode, string(body)) } + var rev struct { Rev string `json:"rev"` } - json.NewDecoder(resp.Body).Decode(&rev) + if err := json.NewDecoder(resp.Body).Decode(&rev); err != nil { + return fmt.Errorf("decoding response failed: %w", err) + } h.Rev = rev.Rev return nil } -// --------------------------------------------------------------------- -// 5. OIKOTIE SCRAPER (single struct – everything in main.go) -// --------------------------------------------------------------------- - type OikotieScraper struct { client *http.Client s3 *S3Client @@ -179,8 +184,7 @@ func NewOikotieScraper(s3 *S3Client) *OikotieScraper { } } -// ---- token handling ------------------------------------------------- -func (osi *OikotieScraper) loadTokens() { +func (osi *OikotieScraper) loadTokens() error { osi.otaToken = getEnv("OTA_TOKEN", "") osi.otaCuid = getEnv("OTA_CUID", "") osi.otaLoaded = getEnv("OTA_LOADED", "") @@ -189,94 +193,132 @@ func (osi *OikotieScraper) loadTokens() { if osi.otaToken == "" || osi.otaCuid == "" || osi.otaLoaded == "" || osi.phpSessID == "" { log.Println("Missing one or more tokens – please enter them now:") r := bufio.NewReader(os.Stdin) + if osi.otaToken == "" { fmt.Print("OTA-token: ") - osi.otaToken, _ = r.ReadString('\n') - osi.otaToken = strings.TrimSpace(osi.otaToken) + token, err := r.ReadString('\n') + if err != nil { + return fmt.Errorf("reading OTA-token failed: %w", err) + } + osi.otaToken = strings.TrimSpace(token) } + if osi.otaCuid == "" { fmt.Print("OTA-cuid: ") - osi.otaCuid, _ = r.ReadString('\n') - osi.otaCuid = strings.TrimSpace(osi.otaCuid) + cuid, err := r.ReadString('\n') + if err != nil { + return fmt.Errorf("reading OTA-cuid failed: %w", err) + } + osi.otaCuid = strings.TrimSpace(cuid) } + if osi.otaLoaded == "" { fmt.Print("OTA-loaded: ") - osi.otaLoaded, _ = r.ReadString('\n') - osi.otaLoaded = strings.TrimSpace(osi.otaLoaded) + loaded, err := r.ReadString('\n') + if err != nil { + return fmt.Errorf("reading OTA-loaded failed: %w", err) + } + osi.otaLoaded = strings.TrimSpace(loaded) } + if osi.phpSessID == "" { fmt.Print("PHPSESSID: ") - osi.phpSessID, _ = r.ReadString('\n') - osi.phpSessID = strings.TrimSpace(osi.phpSessID) + sessID, err := r.ReadString('\n') + if err != nil { + return fmt.Errorf("reading PHPSESSID failed: %w", err) + } + osi.phpSessID = strings.TrimSpace(sessID) } } + return nil } -// ---- main scrape loop ----------------------------------------------- func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error { - os.loadTokens() + if err := os.loadTokens(); err != nil { + return fmt.Errorf("loading tokens failed: %w", err) + } limit := 24 offset := 0 totalSaved := 0 + retryCount := 0 + maxRetries := 3 for { select { case <-ctx.Done(): + log.Println("Context cancelled, stopping scrape") return ctx.Err() case <-os.rateLimiter: + log.Printf("Fetching page with offset %d, limit %d", offset, limit) cards, found, err := os.fetchPage(offset, limit) if err != nil { if strings.Contains(err.Error(), "401") { - log.Println("401 – re-entering tokens") - os.loadTokens() + log.Println("401 Unauthorized – re-entering tokens") + if err := os.loadTokens(); err != nil { + return fmt.Errorf("reloading tokens failed: %w", err) + } + retryCount++ + if retryCount > maxRetries { + return fmt.Errorf("max retries reached for authentication") + } continue } - log.Printf("fetch error (offset %d): %v", offset, err) + + log.Printf("Fetch error (offset %d): %v", offset, err) + retryCount++ + if retryCount > maxRetries { + return fmt.Errorf("max retries reached for fetching") + } time.Sleep(5 * time.Second) continue } + + retryCount = 0 // Reset retry count on successful fetch if len(cards) == 0 { - log.Printf("No cards at offset %d – finished", offset) - break + log.Printf("No cards found at offset %d – finished scraping", offset) + return nil } - for _, c := range cards { + savedInBatch := 0 + for i, c := range cards { + log.Printf("Processing card %d/%d: %s", i+1, len(cards), c.ID) h, err := os.convertCard(c) if err != nil { - log.Printf("convert error %s: %v", c.ID, err) + log.Printf("Convert error for card %s: %v", c.ID, err) continue } if err := couch.Upsert(h); err != nil { - log.Printf("couch upsert %s: %v", h.ID, err) + log.Printf("CouchDB upsert failed for %s: %v", h.ID, err) } else { totalSaved++ + savedInBatch++ } } - log.Printf("offset %d-%d → %d new (total %d/%d)", offset, offset+len(cards)-1, len(cards), totalSaved, found) + log.Printf("Batch %d-%d: %d/%d cards saved (total: %d, found: %d)", + offset, offset+len(cards)-1, savedInBatch, len(cards), totalSaved, found) if offset+len(cards) >= found { - log.Printf("Reached end – %d cards saved", totalSaved) - break + log.Printf("Reached end of results – %d cards saved in total", totalSaved) + return nil } offset += limit } } - return nil } -// ---- API call -------------------------------------------------------- +// Fixed: cardId can be number or string, so use json.Number type apiCard struct { - ID string `json:"cardId"` - Type int `json:"cardType"` - SubType int `json:"cardSubType"` - URL string `json:"url"` - Status int `json:"status"` - Data json.RawMessage - Location json.RawMessage - Company json.RawMessage + ID json.Number `json:"cardId"` + Type int `json:"cardType"` + SubType int `json:"cardSubType"` + URL string `json:"url"` + Status int `json:"status"` + Data json.RawMessage `json:"data"` + Location json.RawMessage `json:"location"` + Company json.RawMessage `json:"company"` Medias []struct { ImageMobileWebPx2 string `json:"imageMobileWebPx2"` } `json:"medias"` @@ -294,9 +336,11 @@ func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) { q.Add("sortBy", "published_sort_desc") reqURL := os.baseURL + "?" + q.Encode() - req, _ := http.NewRequest("GET", reqURL, nil) + req, err := http.NewRequest("GET", reqURL, nil) + if err != nil { + return nil, 0, fmt.Errorf("creating request failed: %w", err) + } - // ---- headers ---------------------------------------------------- req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0") req.Header.Set("Accept", "application/json, text/plain, */*") req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512") @@ -307,7 +351,7 @@ func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) { resp, err := os.client.Do(req) if err != nil { - return nil, 0, err + return nil, 0, fmt.Errorf("HTTP request failed: %w", err) } defer resp.Body.Close() @@ -315,30 +359,40 @@ func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) { return nil, 0, fmt.Errorf("401 Unauthorized") } if resp.StatusCode != 200 { - b, _ := io.ReadAll(resp.Body) - return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(b)) + body, _ := io.ReadAll(resp.Body) + return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(body)) } var payload struct { Found int `json:"found"` Cards []apiCard `json:"cards"` } - if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { - return nil, 0, err + + // Read the body first for better error reporting + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, 0, fmt.Errorf("reading response body failed: %w", err) + } + + if err := json.Unmarshal(body, &payload); err != nil { + log.Printf("Raw response: %s", string(body)) + return nil, 0, fmt.Errorf("JSON unmarshal failed: %w", err) } return payload.Cards, payload.Found, nil } -// ---- conversion ------------------------------------------------------- func (os *OikotieScraper) convertCard(c apiCard) (*House, error) { + // Convert json.Number to string for the ID + cardID := c.ID.String() + h := &House{ - ID: "oikotie_" + c.ID, + ID: "oikotie_" + cardID, Source: "oikotie", - URL: "https://asunnot.oikotie.fi" + c.URL, + URL: c.URL, Status: c.Status, Type: c.Type, SubType: c.SubType, - ScrapedAt: time.Now(), + ScrapedAt: time.Now().UTC(), Raw: map[string]json.RawMessage{ "data": c.Data, "location": c.Location, @@ -346,7 +400,7 @@ func (os *OikotieScraper) convertCard(c apiCard) (*House, error) { }, } - // ---- images → download → S3 → store public URL -------------------- + // Process images for i, m := range c.Medias { if m.ImageMobileWebPx2 == "" { continue @@ -354,19 +408,20 @@ func (os *OikotieScraper) convertCard(c apiCard) (*House, error) { key := fmt.Sprintf("%s/img_%d.webp", h.ID, i) publicURL, err := os.s3.UploadFromURL(m.ImageMobileWebPx2, key) if err != nil { - log.Printf("image upload failed %s: %v", key, err) + log.Printf("Image upload failed for %s: %v", key, err) continue } h.Images = append(h.Images, publicURL) } + + log.Printf("Successfully converted card %s with %d images", cardID, len(h.Images)) return h, nil } -// --------------------------------------------------------------------- -// 6. MAIN -// --------------------------------------------------------------------- - func main() { + log.SetFlags(log.LstdFlags | log.Lshortfile) + log.Println("Starting Oikotie scraper...") + cfg := Config{ CouchURL: getEnv("COUCHDB_URL", "https://couch.tammi.cc"), CouchDB: getEnv("COUCHDB_DATABASE", "asunnot"), @@ -375,21 +430,32 @@ func main() { S3UseSSL: getEnvBool("S3_USE_SSL", true), } + log.Printf("Configuration: CouchDB=%s, S3=%s/%s", cfg.CouchURL, cfg.S3Endpoint, cfg.S3Bucket) + s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL) if err != nil { - log.Fatal("S3 init:", err) + log.Fatalf("S3 initialization failed: %v", err) } + log.Println("S3 client initialized successfully") couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB) + log.Println("CouchDB client initialized successfully") scraper := NewOikotieScraper(s3) + log.Println("Oikotie scraper initialized successfully") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - log.Println("Starting full Oikotie scrape …") + // Handle graceful shutdown + go func() { + <-ctx.Done() + log.Println("Shutting down...") + }() + + log.Println("Starting full Oikotie scrape...") if err := scraper.ScrapeAll(ctx, couch); err != nil { - log.Fatal("scrape failed:", err) + log.Fatalf("Scrape failed: %v", err) } - log.Println("All done!") -} + log.Println("Scraping completed successfully!") +} \ No newline at end of file -- cgit v1.2.3-70-g09d2