diff options
Diffstat (limited to 'scrape/main.go')
| -rw-r--r-- | scrape/main.go | 395 |
1 files changed, 395 insertions, 0 deletions
diff --git a/scrape/main.go b/scrape/main.go new file mode 100644 index 0000000..7ef5ce4 --- /dev/null +++ b/scrape/main.go @@ -0,0 +1,395 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "net/url" + "os" + "strconv" + "strings" + "time" + + "github.com/minio/minio-go/v7" + "github.com/minio/minio-go/v7/pkg/credentials" +) + +// --------------------------------------------------------------------- +// 1. CONFIG & HELPERS +// --------------------------------------------------------------------- + +type Config struct { + CouchURL string + CouchDB string + S3Endpoint string + S3Bucket string + S3UseSSL bool +} + +func getEnv(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func getEnvBool(key string, def bool) bool { + if v := os.Getenv(key); v != "" { + b, _ := strconv.ParseBool(v) + return b + } + return def +} + +// --------------------------------------------------------------------- +// 2. S3 / MINIO CLIENT (public bucket – no keys) +// --------------------------------------------------------------------- + +type S3Client struct { + client *minio.Client + bucket string +} + +func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) { + // anonymous credentials → public bucket + c, err := minio.New(endpoint, &minio.Options{ + Creds: credentials.NewStaticV4("", "", ""), + Secure: useSSL, + }) + if err != nil { + return nil, err + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + exists, err := c.BucketExists(ctx, bucket) + if err != nil || !exists { + return nil, fmt.Errorf("bucket %s not accessible", bucket) + } + return &S3Client{client: c, bucket: bucket}, nil +} + +// UploadFromURL downloads a remote image, puts it in the bucket and returns the public URL +func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) { + resp, err := http.Get(imgURL) + if err != nil { + return "", err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("img status %d", resp.StatusCode) + } + data, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + _, err = s.client.PutObject(context.Background(), s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{ + ContentType: "image/webp", + }) + if err != nil { + return "", err + } + return fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key), nil +} + +// --------------------------------------------------------------------- +// 3. HOUSE MODEL +// --------------------------------------------------------------------- + +type House struct { + ID string `json:"_id"` + Rev string `json:"_rev,omitempty"` + Source string `json:"source"` + URL string `json:"url"` + Status int `json:"status"` + Type int `json:"type"` + SubType int `json:"subType"` + Images []string `json:"images"` + Raw map[string]json.RawMessage `json:"raw,omitempty"` + ScrapedAt time.Time `json:"scraped_at"` +} + +// --------------------------------------------------------------------- +// 4. COUCHDB CLIENT (simplified – only Upsert) +// --------------------------------------------------------------------- + +type CouchClient struct { + baseURL string + database string + client *http.Client +} + +func NewCouchClient(base, db string) *CouchClient { + return &CouchClient{ + baseURL: base, + database: db, + client: &http.Client{Timeout: 30 * time.Second}, + } +} + +func (c *CouchClient) Upsert(h *House) error { + body, _ := json.Marshal(h) + reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, h.ID) + req, _ := http.NewRequest("PUT", reqURL, bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(resp.Body) + return fmt.Errorf("couch %d: %s", resp.StatusCode, string(b)) + } + var rev struct { + Rev string `json:"rev"` + } + json.NewDecoder(resp.Body).Decode(&rev) + h.Rev = rev.Rev + return nil +} + +// --------------------------------------------------------------------- +// 5. OIKOTIE SCRAPER (single struct – everything in main.go) +// --------------------------------------------------------------------- + +type OikotieScraper struct { + client *http.Client + s3 *S3Client + baseURL string + otaToken string + otaCuid string + otaLoaded string + phpSessID string + rateLimiter <-chan time.Time +} + +func NewOikotieScraper(s3 *S3Client) *OikotieScraper { + return &OikotieScraper{ + client: &http.Client{Timeout: 30 * time.Second}, + s3: s3, + baseURL: "https://asunnot.oikotie.fi/api/search", + rateLimiter: time.Tick(3 * time.Second), + } +} + +// ---- token handling ------------------------------------------------- +func (osi *OikotieScraper) loadTokens() { + osi.otaToken = getEnv("OTA_TOKEN", "") + osi.otaCuid = getEnv("OTA_CUID", "") + osi.otaLoaded = getEnv("OTA_LOADED", "") + osi.phpSessID = getEnv("PHPSESSID", "") + + if osi.otaToken == "" || osi.otaCuid == "" || osi.otaLoaded == "" || osi.phpSessID == "" { + log.Println("Missing one or more tokens – please enter them now:") + r := bufio.NewReader(os.Stdin) + if osi.otaToken == "" { + fmt.Print("OTA-token: ") + osi.otaToken, _ = r.ReadString('\n') + osi.otaToken = strings.TrimSpace(osi.otaToken) + } + if osi.otaCuid == "" { + fmt.Print("OTA-cuid: ") + osi.otaCuid, _ = r.ReadString('\n') + osi.otaCuid = strings.TrimSpace(osi.otaCuid) + } + if osi.otaLoaded == "" { + fmt.Print("OTA-loaded: ") + osi.otaLoaded, _ = r.ReadString('\n') + osi.otaLoaded = strings.TrimSpace(osi.otaLoaded) + } + if osi.phpSessID == "" { + fmt.Print("PHPSESSID: ") + osi.phpSessID, _ = r.ReadString('\n') + osi.phpSessID = strings.TrimSpace(osi.phpSessID) + } + } +} + +// ---- main scrape loop ----------------------------------------------- +func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error { + os.loadTokens() + + limit := 24 + offset := 0 + totalSaved := 0 + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-os.rateLimiter: + cards, found, err := os.fetchPage(offset, limit) + if err != nil { + if strings.Contains(err.Error(), "401") { + log.Println("401 – re-entering tokens") + os.loadTokens() + continue + } + log.Printf("fetch error (offset %d): %v", offset, err) + time.Sleep(5 * time.Second) + continue + } + + if len(cards) == 0 { + log.Printf("No cards at offset %d – finished", offset) + break + } + + for _, c := range cards { + h, err := os.convertCard(c) + if err != nil { + log.Printf("convert error %s: %v", c.ID, err) + continue + } + if err := couch.Upsert(h); err != nil { + log.Printf("couch upsert %s: %v", h.ID, err) + } else { + totalSaved++ + } + } + + log.Printf("offset %d-%d → %d new (total %d/%d)", offset, offset+len(cards)-1, len(cards), totalSaved, found) + + if offset+len(cards) >= found { + log.Printf("Reached end – %d cards saved", totalSaved) + break + } + offset += limit + } + } + return nil +} + +// ---- API call -------------------------------------------------------- +type apiCard struct { + ID string `json:"cardId"` + Type int `json:"cardType"` + SubType int `json:"cardSubType"` + URL string `json:"url"` + Status int `json:"status"` + Data json.RawMessage + Location json.RawMessage + Company json.RawMessage + Medias []struct { + ImageMobileWebPx2 string `json:"imageMobileWebPx2"` + } `json:"medias"` +} + +func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) { + q := url.Values{} + q.Add("locations", `[[64,6,"Helsinki"]]`) + for _, bt := range []string{"4", "8", "32", "128", "64", "512"} { + q.Add("buildingType[]", bt) + } + q.Add("cardType", "100") + q.Add("limit", strconv.Itoa(limit)) + q.Add("offset", strconv.Itoa(offset)) + q.Add("sortBy", "published_sort_desc") + + reqURL := os.baseURL + "?" + q.Encode() + req, _ := http.NewRequest("GET", reqURL, nil) + + // ---- headers ---------------------------------------------------- + req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0") + req.Header.Set("Accept", "application/json, text/plain, */*") + req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512") + req.Header.Set("OTA-token", os.otaToken) + req.Header.Set("OTA-cuid", os.otaCuid) + req.Header.Set("OTA-loaded", os.otaLoaded) + req.Header.Set("Cookie", fmt.Sprintf("PHPSESSID=%s; user_id=%s; cardType=100", os.phpSessID, os.otaCuid)) + + resp, err := os.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + + if resp.StatusCode == 401 { + return nil, 0, fmt.Errorf("401 Unauthorized") + } + if resp.StatusCode != 200 { + b, _ := io.ReadAll(resp.Body) + return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(b)) + } + + var payload struct { + Found int `json:"found"` + Cards []apiCard `json:"cards"` + } + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + return nil, 0, err + } + return payload.Cards, payload.Found, nil +} + +// ---- conversion ------------------------------------------------------- +func (os *OikotieScraper) convertCard(c apiCard) (*House, error) { + h := &House{ + ID: "oikotie_" + c.ID, + Source: "oikotie", + URL: "https://asunnot.oikotie.fi" + c.URL, + Status: c.Status, + Type: c.Type, + SubType: c.SubType, + ScrapedAt: time.Now(), + Raw: map[string]json.RawMessage{ + "data": c.Data, + "location": c.Location, + "company": c.Company, + }, + } + + // ---- images → download → S3 → store public URL -------------------- + for i, m := range c.Medias { + if m.ImageMobileWebPx2 == "" { + continue + } + key := fmt.Sprintf("%s/img_%d.webp", h.ID, i) + publicURL, err := os.s3.UploadFromURL(m.ImageMobileWebPx2, key) + if err != nil { + log.Printf("image upload failed %s: %v", key, err) + continue + } + h.Images = append(h.Images, publicURL) + } + return h, nil +} + +// --------------------------------------------------------------------- +// 6. MAIN +// --------------------------------------------------------------------- + +func main() { + cfg := Config{ + CouchURL: getEnv("COUCHDB_URL", "https://couch.tammi.cc"), + CouchDB: getEnv("COUCHDB_DATABASE", "asunnot"), + S3Endpoint: getEnv("S3_ENDPOINT", "s3.tammi.cc"), + S3Bucket: getEnv("S3_BUCKET", "asunnot"), + S3UseSSL: getEnvBool("S3_USE_SSL", true), + } + + s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL) + if err != nil { + log.Fatal("S3 init:", err) + } + + couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB) + + scraper := NewOikotieScraper(s3) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + log.Println("Starting full Oikotie scrape …") + if err := scraper.ScrapeAll(ctx, couch); err != nil { + log.Fatal("scrape failed:", err) + } + log.Println("All done!") +} |
