aboutsummaryrefslogtreecommitdiffstats
path: root/scrape/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape/main.go')
-rw-r--r--scrape/main.go248
1 files changed, 157 insertions, 91 deletions
diff --git a/scrape/main.go b/scrape/main.go
index 7ef5ce4..fed0397 100644
--- a/scrape/main.go
+++ b/scrape/main.go
@@ -19,10 +19,6 @@ import (
"github.com/minio/minio-go/v7/pkg/credentials"
)
-// ---------------------------------------------------------------------
-// 1. CONFIG & HELPERS
-// ---------------------------------------------------------------------
-
type Config struct {
CouchURL string
CouchDB string
@@ -46,10 +42,6 @@ func getEnvBool(key string, def bool) bool {
return def
}
-// ---------------------------------------------------------------------
-// 2. S3 / MINIO CLIENT (public bucket – no keys)
-// ---------------------------------------------------------------------
-
type S3Client struct {
client *minio.Client
bucket string
@@ -62,44 +54,53 @@ func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) {
Secure: useSSL,
})
if err != nil {
- return nil, err
+ return nil, fmt.Errorf("minio client creation failed: %w", err)
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
exists, err := c.BucketExists(ctx, bucket)
- if err != nil || !exists {
- return nil, fmt.Errorf("bucket %s not accessible", bucket)
+ if err != nil {
+ return nil, fmt.Errorf("bucket check failed: %w", err)
+ }
+ if !exists {
+ return nil, fmt.Errorf("bucket %s does not exist", bucket)
}
return &S3Client{client: c, bucket: bucket}, nil
}
// UploadFromURL downloads a remote image, puts it in the bucket and returns the public URL
func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) {
+ log.Printf("Downloading image: %s", imgURL)
resp, err := http.Get(imgURL)
if err != nil {
- return "", err
+ return "", fmt.Errorf("HTTP GET failed: %w", err)
}
defer resp.Body.Close()
+
if resp.StatusCode != http.StatusOK {
- return "", fmt.Errorf("img status %d", resp.StatusCode)
+ return "", fmt.Errorf("image download failed with status %d", resp.StatusCode)
}
+
data, err := io.ReadAll(resp.Body)
if err != nil {
- return "", err
+ return "", fmt.Errorf("reading response body failed: %w", err)
}
- _, err = s.client.PutObject(context.Background(), s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{
+
+ ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+ defer cancel()
+
+ _, err = s.client.PutObject(ctx, s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{
ContentType: "image/webp",
})
if err != nil {
- return "", err
+ return "", fmt.Errorf("S3 upload failed: %w", err)
}
- return fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key), nil
+
+ publicURL := fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key)
+ log.Printf("Successfully uploaded image: %s", publicURL)
+ return publicURL, nil
}
-// ---------------------------------------------------------------------
-// 3. HOUSE MODEL
-// ---------------------------------------------------------------------
-
type House struct {
ID string `json:"_id"`
Rev string `json:"_rev,omitempty"`
@@ -113,10 +114,6 @@ type House struct {
ScrapedAt time.Time `json:"scraped_at"`
}
-// ---------------------------------------------------------------------
-// 4. COUCHDB CLIENT (simplified – only Upsert)
-// ---------------------------------------------------------------------
-
type CouchClient struct {
baseURL string
database string
@@ -132,33 +129,41 @@ func NewCouchClient(base, db string) *CouchClient {
}
func (c *CouchClient) Upsert(h *House) error {
- body, _ := json.Marshal(h)
- reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, h.ID)
- req, _ := http.NewRequest("PUT", reqURL, bytes.NewReader(body))
+ body, err := json.Marshal(h)
+ if err != nil {
+ return fmt.Errorf("JSON marshal failed: %w", err)
+ }
+
+ reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, url.PathEscape(h.ID))
+ req, err := http.NewRequest("PUT", reqURL, bytes.NewReader(body))
+ if err != nil {
+ return fmt.Errorf("request creation failed: %w", err)
+ }
+
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Accept", "application/json")
resp, err := c.client.Do(req)
if err != nil {
- return err
+ return fmt.Errorf("HTTP request failed: %w", err)
}
defer resp.Body.Close()
+
if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK {
- b, _ := io.ReadAll(resp.Body)
- return fmt.Errorf("couch %d: %s", resp.StatusCode, string(b))
+ body, _ := io.ReadAll(resp.Body)
+ return fmt.Errorf("couchDB responded with status %d: %s", resp.StatusCode, string(body))
}
+
var rev struct {
Rev string `json:"rev"`
}
- json.NewDecoder(resp.Body).Decode(&rev)
+ if err := json.NewDecoder(resp.Body).Decode(&rev); err != nil {
+ return fmt.Errorf("decoding response failed: %w", err)
+ }
h.Rev = rev.Rev
return nil
}
-// ---------------------------------------------------------------------
-// 5. OIKOTIE SCRAPER (single struct – everything in main.go)
-// ---------------------------------------------------------------------
-
type OikotieScraper struct {
client *http.Client
s3 *S3Client
@@ -179,8 +184,7 @@ func NewOikotieScraper(s3 *S3Client) *OikotieScraper {
}
}
-// ---- token handling -------------------------------------------------
-func (osi *OikotieScraper) loadTokens() {
+func (osi *OikotieScraper) loadTokens() error {
osi.otaToken = getEnv("OTA_TOKEN", "")
osi.otaCuid = getEnv("OTA_CUID", "")
osi.otaLoaded = getEnv("OTA_LOADED", "")
@@ -189,94 +193,132 @@ func (osi *OikotieScraper) loadTokens() {
if osi.otaToken == "" || osi.otaCuid == "" || osi.otaLoaded == "" || osi.phpSessID == "" {
log.Println("Missing one or more tokens – please enter them now:")
r := bufio.NewReader(os.Stdin)
+
if osi.otaToken == "" {
fmt.Print("OTA-token: ")
- osi.otaToken, _ = r.ReadString('\n')
- osi.otaToken = strings.TrimSpace(osi.otaToken)
+ token, err := r.ReadString('\n')
+ if err != nil {
+ return fmt.Errorf("reading OTA-token failed: %w", err)
+ }
+ osi.otaToken = strings.TrimSpace(token)
}
+
if osi.otaCuid == "" {
fmt.Print("OTA-cuid: ")
- osi.otaCuid, _ = r.ReadString('\n')
- osi.otaCuid = strings.TrimSpace(osi.otaCuid)
+ cuid, err := r.ReadString('\n')
+ if err != nil {
+ return fmt.Errorf("reading OTA-cuid failed: %w", err)
+ }
+ osi.otaCuid = strings.TrimSpace(cuid)
}
+
if osi.otaLoaded == "" {
fmt.Print("OTA-loaded: ")
- osi.otaLoaded, _ = r.ReadString('\n')
- osi.otaLoaded = strings.TrimSpace(osi.otaLoaded)
+ loaded, err := r.ReadString('\n')
+ if err != nil {
+ return fmt.Errorf("reading OTA-loaded failed: %w", err)
+ }
+ osi.otaLoaded = strings.TrimSpace(loaded)
}
+
if osi.phpSessID == "" {
fmt.Print("PHPSESSID: ")
- osi.phpSessID, _ = r.ReadString('\n')
- osi.phpSessID = strings.TrimSpace(osi.phpSessID)
+ sessID, err := r.ReadString('\n')
+ if err != nil {
+ return fmt.Errorf("reading PHPSESSID failed: %w", err)
+ }
+ osi.phpSessID = strings.TrimSpace(sessID)
}
}
+ return nil
}
-// ---- main scrape loop -----------------------------------------------
func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error {
- os.loadTokens()
+ if err := os.loadTokens(); err != nil {
+ return fmt.Errorf("loading tokens failed: %w", err)
+ }
limit := 24
offset := 0
totalSaved := 0
+ retryCount := 0
+ maxRetries := 3
for {
select {
case <-ctx.Done():
+ log.Println("Context cancelled, stopping scrape")
return ctx.Err()
case <-os.rateLimiter:
+ log.Printf("Fetching page with offset %d, limit %d", offset, limit)
cards, found, err := os.fetchPage(offset, limit)
if err != nil {
if strings.Contains(err.Error(), "401") {
- log.Println("401 – re-entering tokens")
- os.loadTokens()
+ log.Println("401 Unauthorized – re-entering tokens")
+ if err := os.loadTokens(); err != nil {
+ return fmt.Errorf("reloading tokens failed: %w", err)
+ }
+ retryCount++
+ if retryCount > maxRetries {
+ return fmt.Errorf("max retries reached for authentication")
+ }
continue
}
- log.Printf("fetch error (offset %d): %v", offset, err)
+
+ log.Printf("Fetch error (offset %d): %v", offset, err)
+ retryCount++
+ if retryCount > maxRetries {
+ return fmt.Errorf("max retries reached for fetching")
+ }
time.Sleep(5 * time.Second)
continue
}
+
+ retryCount = 0 // Reset retry count on successful fetch
if len(cards) == 0 {
- log.Printf("No cards at offset %d – finished", offset)
- break
+ log.Printf("No cards found at offset %d – finished scraping", offset)
+ return nil
}
- for _, c := range cards {
+ savedInBatch := 0
+ for i, c := range cards {
+ log.Printf("Processing card %d/%d: %s", i+1, len(cards), c.ID)
h, err := os.convertCard(c)
if err != nil {
- log.Printf("convert error %s: %v", c.ID, err)
+ log.Printf("Convert error for card %s: %v", c.ID, err)
continue
}
if err := couch.Upsert(h); err != nil {
- log.Printf("couch upsert %s: %v", h.ID, err)
+ log.Printf("CouchDB upsert failed for %s: %v", h.ID, err)
} else {
totalSaved++
+ savedInBatch++
}
}
- log.Printf("offset %d-%d → %d new (total %d/%d)", offset, offset+len(cards)-1, len(cards), totalSaved, found)
+ log.Printf("Batch %d-%d: %d/%d cards saved (total: %d, found: %d)",
+ offset, offset+len(cards)-1, savedInBatch, len(cards), totalSaved, found)
if offset+len(cards) >= found {
- log.Printf("Reached end – %d cards saved", totalSaved)
- break
+ log.Printf("Reached end of results – %d cards saved in total", totalSaved)
+ return nil
}
offset += limit
}
}
- return nil
}
-// ---- API call --------------------------------------------------------
+// Fixed: cardId can be number or string, so use json.Number
type apiCard struct {
- ID string `json:"cardId"`
- Type int `json:"cardType"`
- SubType int `json:"cardSubType"`
- URL string `json:"url"`
- Status int `json:"status"`
- Data json.RawMessage
- Location json.RawMessage
- Company json.RawMessage
+ ID json.Number `json:"cardId"`
+ Type int `json:"cardType"`
+ SubType int `json:"cardSubType"`
+ URL string `json:"url"`
+ Status int `json:"status"`
+ Data json.RawMessage `json:"data"`
+ Location json.RawMessage `json:"location"`
+ Company json.RawMessage `json:"company"`
Medias []struct {
ImageMobileWebPx2 string `json:"imageMobileWebPx2"`
} `json:"medias"`
@@ -294,9 +336,11 @@ func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) {
q.Add("sortBy", "published_sort_desc")
reqURL := os.baseURL + "?" + q.Encode()
- req, _ := http.NewRequest("GET", reqURL, nil)
+ req, err := http.NewRequest("GET", reqURL, nil)
+ if err != nil {
+ return nil, 0, fmt.Errorf("creating request failed: %w", err)
+ }
- // ---- headers ----------------------------------------------------
req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0")
req.Header.Set("Accept", "application/json, text/plain, */*")
req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512")
@@ -307,7 +351,7 @@ func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) {
resp, err := os.client.Do(req)
if err != nil {
- return nil, 0, err
+ return nil, 0, fmt.Errorf("HTTP request failed: %w", err)
}
defer resp.Body.Close()
@@ -315,30 +359,40 @@ func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) {
return nil, 0, fmt.Errorf("401 Unauthorized")
}
if resp.StatusCode != 200 {
- b, _ := io.ReadAll(resp.Body)
- return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(b))
+ body, _ := io.ReadAll(resp.Body)
+ return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(body))
}
var payload struct {
Found int `json:"found"`
Cards []apiCard `json:"cards"`
}
- if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
- return nil, 0, err
+
+ // Read the body first for better error reporting
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, 0, fmt.Errorf("reading response body failed: %w", err)
+ }
+
+ if err := json.Unmarshal(body, &payload); err != nil {
+ log.Printf("Raw response: %s", string(body))
+ return nil, 0, fmt.Errorf("JSON unmarshal failed: %w", err)
}
return payload.Cards, payload.Found, nil
}
-// ---- conversion -------------------------------------------------------
func (os *OikotieScraper) convertCard(c apiCard) (*House, error) {
+ // Convert json.Number to string for the ID
+ cardID := c.ID.String()
+
h := &House{
- ID: "oikotie_" + c.ID,
+ ID: "oikotie_" + cardID,
Source: "oikotie",
- URL: "https://asunnot.oikotie.fi" + c.URL,
+ URL: c.URL,
Status: c.Status,
Type: c.Type,
SubType: c.SubType,
- ScrapedAt: time.Now(),
+ ScrapedAt: time.Now().UTC(),
Raw: map[string]json.RawMessage{
"data": c.Data,
"location": c.Location,
@@ -346,7 +400,7 @@ func (os *OikotieScraper) convertCard(c apiCard) (*House, error) {
},
}
- // ---- images → download → S3 → store public URL --------------------
+ // Process images
for i, m := range c.Medias {
if m.ImageMobileWebPx2 == "" {
continue
@@ -354,19 +408,20 @@ func (os *OikotieScraper) convertCard(c apiCard) (*House, error) {
key := fmt.Sprintf("%s/img_%d.webp", h.ID, i)
publicURL, err := os.s3.UploadFromURL(m.ImageMobileWebPx2, key)
if err != nil {
- log.Printf("image upload failed %s: %v", key, err)
+ log.Printf("Image upload failed for %s: %v", key, err)
continue
}
h.Images = append(h.Images, publicURL)
}
+
+ log.Printf("Successfully converted card %s with %d images", cardID, len(h.Images))
return h, nil
}
-// ---------------------------------------------------------------------
-// 6. MAIN
-// ---------------------------------------------------------------------
-
func main() {
+ log.SetFlags(log.LstdFlags | log.Lshortfile)
+ log.Println("Starting Oikotie scraper...")
+
cfg := Config{
CouchURL: getEnv("COUCHDB_URL", "https://couch.tammi.cc"),
CouchDB: getEnv("COUCHDB_DATABASE", "asunnot"),
@@ -375,21 +430,32 @@ func main() {
S3UseSSL: getEnvBool("S3_USE_SSL", true),
}
+ log.Printf("Configuration: CouchDB=%s, S3=%s/%s", cfg.CouchURL, cfg.S3Endpoint, cfg.S3Bucket)
+
s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL)
if err != nil {
- log.Fatal("S3 init:", err)
+ log.Fatalf("S3 initialization failed: %v", err)
}
+ log.Println("S3 client initialized successfully")
couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB)
+ log.Println("CouchDB client initialized successfully")
scraper := NewOikotieScraper(s3)
+ log.Println("Oikotie scraper initialized successfully")
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
- log.Println("Starting full Oikotie scrape …")
+ // Handle graceful shutdown
+ go func() {
+ <-ctx.Done()
+ log.Println("Shutting down...")
+ }()
+
+ log.Println("Starting full Oikotie scrape...")
if err := scraper.ScrapeAll(ctx, couch); err != nil {
- log.Fatal("scrape failed:", err)
+ log.Fatalf("Scrape failed: %v", err)
}
- log.Println("All done!")
-}
+ log.Println("Scraping completed successfully!")
+} \ No newline at end of file