aboutsummaryrefslogtreecommitdiffstats
path: root/scrape/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape/main.go')
-rw-r--r--scrape/main.go395
1 files changed, 395 insertions, 0 deletions
diff --git a/scrape/main.go b/scrape/main.go
new file mode 100644
index 0000000..7ef5ce4
--- /dev/null
+++ b/scrape/main.go
@@ -0,0 +1,395 @@
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"os"
+	"os/signal"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/minio/minio-go/v7"
+	"github.com/minio/minio-go/v7/pkg/credentials"
+)
+
+// ---------------------------------------------------------------------
+// 1. CONFIG & HELPERS
+// ---------------------------------------------------------------------
+
+// Config collects the settings that main reads from the environment.
+type Config struct {
+	// CouchURL is the CouchDB base URL; CouchDB is the target database name.
+	CouchURL, CouchDB string
+
+	// S3Endpoint is the endpoint handed to the MinIO client; S3Bucket is the
+	// bucket that receives the scraped images.
+	S3Endpoint, S3Bucket string
+
+	// S3UseSSL is forwarded to the MinIO client as its Secure option.
+	S3UseSSL bool
+}
+
+// getEnv returns the value of the environment variable key, or def when the
+// variable is unset or set to the empty string.
+func getEnv(key, def string) string {
+	value := os.Getenv(key)
+	if value == "" {
+		return def
+	}
+	return value
+}
+
+// getEnvBool returns the boolean value of the environment variable key.
+//
+// def is returned when the variable is unset, empty, or cannot be parsed by
+// strconv.ParseBool. Falling back to def on a parse error means a typo such
+// as S3_USE_SSL=yes cannot silently switch a default-true flag off.
+func getEnvBool(key string, def bool) bool {
+	v := os.Getenv(key)
+	if v == "" {
+		return def
+	}
+	b, err := strconv.ParseBool(v)
+	if err != nil {
+		log.Printf("invalid boolean %q for %s – using default %t", v, key, def)
+		return def
+	}
+	return b
+}
+
+// ---------------------------------------------------------------------
+// 2. S3 / MINIO CLIENT (public bucket – no keys)
+// ---------------------------------------------------------------------
+
+// S3Client wraps a MinIO client that is bound to a single bucket.
+type S3Client struct {
+	client *minio.Client
+	bucket string
+}
+
+// NewS3Client builds a client for endpoint and verifies, within five
+// seconds, that bucket exists.
+//
+// useSSL is passed to MinIO as the Secure option. An error is returned when
+// the client cannot be created, when the existence check fails, or when the
+// bucket does not exist; the underlying cause is wrapped so callers can see
+// why the bucket was not accessible.
+func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) {
+	// anonymous credentials (empty key pair) → public bucket
+	c, err := minio.New(endpoint, &minio.Options{
+		Creds:  credentials.NewStaticV4("", "", ""),
+		Secure: useSSL,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("creating S3 client for %s: %w", endpoint, err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	exists, err := c.BucketExists(ctx, bucket)
+	if err != nil {
+		return nil, fmt.Errorf("bucket %s not accessible: %w", bucket, err)
+	}
+	if !exists {
+		return nil, fmt.Errorf("bucket %s not accessible: bucket does not exist", bucket)
+	}
+	return &S3Client{client: c, bucket: bucket}, nil
+}
+
+// UploadFromURL downloads the image at imgURL, stores it in the bucket under
+// key with content type image/webp, and returns the URL of the stored object
+// (endpoint scheme and host, then bucket, then key).
+//
+// An error is returned when the download fails, the source answers with
+// anything other than 200 OK, or the upload fails.
+func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) {
+	// One deadline covers download and upload so that a stalled peer cannot
+	// block the whole scrape; the default HTTP client has no timeout.
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, imgURL, nil)
+	if err != nil {
+		return "", fmt.Errorf("building image request %q: %w", imgURL, err)
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("downloading %q: %w", imgURL, err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("img status %d", resp.StatusCode)
+	}
+
+	data, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("reading %q: %w", imgURL, err)
+	}
+
+	_, err = s.client.PutObject(ctx, s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{
+		ContentType: "image/webp",
+	})
+	if err != nil {
+		return "", fmt.Errorf("uploading %s: %w", key, err)
+	}
+
+	// Use the scheme the client was configured with instead of assuming
+	// HTTPS, so the link is also correct for a plain-HTTP endpoint.
+	endpoint := s.client.EndpointURL()
+	return fmt.Sprintf("%s://%s/%s/%s", endpoint.Scheme, endpoint.Host, s.bucket, key), nil
+}
+
+// ---------------------------------------------------------------------
+// 3. HOUSE MODEL
+// ---------------------------------------------------------------------
+
+// House is the document written to CouchDB for one scraped listing.
+// Field order is kept as-is because it determines the order of the keys in
+// the marshalled JSON.
+type House struct {
+ // ID is the document ID; convertCard sets it to "oikotie_" + the card ID.
+ ID string `json:"_id"`
+ // Rev is the document revision; Upsert overwrites it with the "rev"
+ // value from the CouchDB response. Omitted from the JSON when empty.
+ Rev string `json:"_rev,omitempty"`
+ // Source names the site the listing came from ("oikotie" in this file).
+ Source string `json:"source"`
+ // URL is the absolute link to the listing page.
+ URL string `json:"url"`
+ // Status, Type and SubType are copied verbatim from the API card.
+ Status int `json:"status"`
+ Type int `json:"type"`
+ SubType int `json:"subType"`
+ // Images holds the URLs returned by S3Client.UploadFromURL for each
+ // successfully mirrored image. No omitempty: a nil slice encodes as null.
+ Images []string `json:"images"`
+ // Raw keeps undecoded API payload fragments; convertCard stores them
+ // under the keys "data", "location" and "company".
+ Raw map[string]json.RawMessage `json:"raw,omitempty"`
+ // ScrapedAt is the time at which the card was converted.
+ ScrapedAt time.Time `json:"scraped_at"`
+}
+
+// ---------------------------------------------------------------------
+// 4. COUCHDB CLIENT (simplified – only Upsert)
+// ---------------------------------------------------------------------
+
+// CouchClient is a minimal CouchDB HTTP client bound to one database.
+type CouchClient struct {
+	baseURL  string
+	database string
+	client   *http.Client
+}
+
+// NewCouchClient returns a client for database db on the server at base.
+// Requests made through it time out after 30 seconds.
+func NewCouchClient(base, db string) *CouchClient {
+	return &CouchClient{
+		// Document URLs are built as baseURL + "/" + database + "/" + id,
+		// so a trailing slash on base would produce a "//" path segment.
+		baseURL:  strings.TrimRight(base, "/"),
+		database: db,
+		client:   &http.Client{Timeout: 30 * time.Second},
+	}
+}
+
+// Upsert creates the document for h, or replaces the stored document with
+// the same ID, and records the resulting revision in h.Rev.
+//
+// When h.Rev is empty the current revision is looked up first, because
+// CouchDB answers a PUT to an existing document with 409 Conflict unless
+// the current revision accompanies it. An error is returned when the
+// document cannot be encoded, a request fails, CouchDB answers with an
+// unexpected status, or the response cannot be decoded.
+func (c *CouchClient) Upsert(h *House) error {
+	// Escape the ID so characters such as "/" or "?" cannot change the path.
+	docURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, url.PathEscape(h.ID))
+
+	if h.Rev == "" {
+		rev, err := c.currentRev(docURL)
+		if err != nil {
+			return fmt.Errorf("looking up revision of %s: %w", h.ID, err)
+		}
+		h.Rev = rev
+	}
+
+	body, err := json.Marshal(h)
+	if err != nil {
+		return fmt.Errorf("encoding %s: %w", h.ID, err)
+	}
+	req, err := http.NewRequest(http.MethodPut, docURL, bytes.NewReader(body))
+	if err != nil {
+		return fmt.Errorf("building request for %s: %w", h.ID, err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK, http.StatusCreated, http.StatusAccepted:
+		// stored (202: accepted but not yet written by a quorum)
+	default:
+		b, _ := io.ReadAll(resp.Body) // best effort: body only enriches the error
+		return fmt.Errorf("couch %d: %s", resp.StatusCode, string(b))
+	}
+
+	var result struct {
+		Rev string `json:"rev"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return fmt.Errorf("decoding couch response for %s: %w", h.ID, err)
+	}
+	h.Rev = result.Rev
+	return nil
+}
+
+// currentRev returns the revision of the document at docURL as reported in
+// the ETag header of a HEAD request, or "" when the document does not exist.
+func (c *CouchClient) currentRev(docURL string) (string, error) {
+	req, err := http.NewRequest(http.MethodHead, docURL, nil)
+	if err != nil {
+		return "", err
+	}
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK:
+		// The ETag carries the revision in double quotes.
+		etag := strings.TrimPrefix(resp.Header.Get("ETag"), "W/")
+		return strings.Trim(etag, `"`), nil
+	case http.StatusNotFound:
+		return "", nil
+	default:
+		return "", fmt.Errorf("couch %d", resp.StatusCode)
+	}
+}
+
+// ---------------------------------------------------------------------
+// 5. OIKOTIE SCRAPER (single struct – everything in main.go)
+// ---------------------------------------------------------------------
+
+// OikotieScraper pages through the Oikotie search API, mirrors listing
+// images to S3 and turns the returned cards into House documents.
+type OikotieScraper struct {
+	client  *http.Client
+	s3      *S3Client
+	baseURL string
+
+	// Session values sent with every search request; filled by loadTokens.
+	otaToken, otaCuid, otaLoaded, phpSessID string
+
+	// rateLimiter delivers one tick per permitted search request.
+	rateLimiter <-chan time.Time
+}
+
+// NewOikotieScraper returns a scraper that uploads images through s3, talks
+// to the Oikotie search endpoint with a 30-second HTTP timeout, and is
+// paced by a three-second ticker.
+func NewOikotieScraper(s3 *S3Client) *OikotieScraper {
+	scraper := new(OikotieScraper)
+	scraper.s3 = s3
+	scraper.baseURL = "https://asunnot.oikotie.fi/api/search"
+	scraper.client = &http.Client{Timeout: 30 * time.Second}
+	scraper.rateLimiter = time.Tick(3 * time.Second)
+	return scraper
+}
+
+// ---- token handling -------------------------------------------------
+
+// loadTokens fills the four session values (OTA-token, OTA-cuid,
+// OTA-loaded, PHPSESSID) that fetchPage sends with every request.
+//
+// On the first call the values come from the environment variables
+// OTA_TOKEN, OTA_CUID, OTA_LOADED and PHPSESSID, and any that are missing
+// are read interactively from stdin. When it is called again with values
+// already present, those values are discarded and all four are prompted
+// for, because re-reading the same environment would return the same
+// values again and the caller would retry indefinitely.
+// A stdin read error is logged and leaves the affected value as whatever
+// was read up to that point.
+func (s *OikotieScraper) loadTokens() {
+	tokens := []struct {
+		env, label string
+		dst        *string
+	}{
+		{"OTA_TOKEN", "OTA-token", &s.otaToken},
+		{"OTA_CUID", "OTA-cuid", &s.otaCuid},
+		{"OTA_LOADED", "OTA-loaded", &s.otaLoaded},
+		{"PHPSESSID", "PHPSESSID", &s.phpSessID},
+	}
+
+	reload := false
+	for _, tk := range tokens {
+		if *tk.dst != "" {
+			reload = true
+		}
+	}
+
+	missing := false
+	for _, tk := range tokens {
+		if reload {
+			*tk.dst = ""
+		} else {
+			*tk.dst = getEnv(tk.env, "")
+		}
+		if *tk.dst == "" {
+			missing = true
+		}
+	}
+	if !missing {
+		return
+	}
+
+	log.Println("Missing one or more tokens – please enter them now:")
+	r := bufio.NewReader(os.Stdin)
+	for _, tk := range tokens {
+		if *tk.dst != "" {
+			continue
+		}
+		fmt.Print(tk.label + ": ")
+		line, err := r.ReadString('\n')
+		if err != nil {
+			log.Printf("reading %s: %v", tk.label, err)
+		}
+		*tk.dst = strings.TrimSpace(line)
+	}
+}
+
+// ---- main scrape loop -----------------------------------------------
+func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error {
+ os.loadTokens()
+
+ limit := 24
+ offset := 0
+ totalSaved := 0
+
+ for {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-os.rateLimiter:
+ cards, found, err := os.fetchPage(offset, limit)
+ if err != nil {
+ if strings.Contains(err.Error(), "401") {
+ log.Println("401 – re-entering tokens")
+ os.loadTokens()
+ continue
+ }
+ log.Printf("fetch error (offset %d): %v", offset, err)
+ time.Sleep(5 * time.Second)
+ continue
+ }
+
+ if len(cards) == 0 {
+ log.Printf("No cards at offset %d – finished", offset)
+ break
+ }
+
+ for _, c := range cards {
+ h, err := os.convertCard(c)
+ if err != nil {
+ log.Printf("convert error %s: %v", c.ID, err)
+ continue
+ }
+ if err := couch.Upsert(h); err != nil {
+ log.Printf("couch upsert %s: %v", h.ID, err)
+ } else {
+ totalSaved++
+ }
+ }
+
+ log.Printf("offset %d-%d → %d new (total %d/%d)", offset, offset+len(cards)-1, len(cards), totalSaved, found)
+
+ if offset+len(cards) >= found {
+ log.Printf("Reached end – %d cards saved", totalSaved)
+ break
+ }
+ offset += limit
+ }
+ }
+ return nil
+}
+
+// ---- API call --------------------------------------------------------
+
+// apiCard is the subset of one entry of the search response's "cards"
+// array that the scraper reads.
+type apiCard struct {
+	ID      string `json:"cardId"`
+	Type    int    `json:"cardType"`
+	SubType int    `json:"cardSubType"`
+	URL     string `json:"url"`
+	Status  int    `json:"status"`
+
+	// Kept undecoded and copied into House.Raw. The keys are spelled out
+	// like every other field instead of relying on encoding/json's
+	// case-insensitive fallback matching of the Go field name.
+	Data     json.RawMessage `json:"data"`
+	Location json.RawMessage `json:"location"`
+	Company  json.RawMessage `json:"company"`
+
+	// Medias lists the card's images; only the mobile WebP variant is read.
+	Medias []struct {
+		ImageMobileWebPx2 string `json:"imageMobileWebPx2"`
+	} `json:"medias"`
+}
+
+// fetchPage requests one page of search results (Helsinki, card type 100,
+// a fixed set of building types, newest first) and returns the cards on
+// that page together with the total number of matches the API reports.
+//
+// offset and limit are sent as the query parameters of the same name. A
+// 401 response yields the error "401 Unauthorized"; any other non-200
+// response yields an error carrying the status code and response body.
+func (s *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) {
+	q := url.Values{}
+	q.Add("locations", `[[64,6,"Helsinki"]]`)
+	for _, bt := range []string{"4", "8", "32", "128", "64", "512"} {
+		q.Add("buildingType[]", bt)
+	}
+	q.Add("cardType", "100")
+	q.Add("limit", strconv.Itoa(limit))
+	q.Add("offset", strconv.Itoa(offset))
+	q.Add("sortBy", "published_sort_desc")
+
+	// A malformed base URL makes NewRequest return a nil request; report it
+	// instead of dereferencing nil when the headers are set below.
+	req, err := http.NewRequest(http.MethodGet, s.baseURL+"?"+q.Encode(), nil)
+	if err != nil {
+		return nil, 0, fmt.Errorf("building search request: %w", err)
+	}
+
+	// ---- headers ----------------------------------------------------
+	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0")
+	req.Header.Set("Accept", "application/json, text/plain, */*")
+	req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512")
+	req.Header.Set("OTA-token", s.otaToken)
+	req.Header.Set("OTA-cuid", s.otaCuid)
+	req.Header.Set("OTA-loaded", s.otaLoaded)
+	req.Header.Set("Cookie", fmt.Sprintf("PHPSESSID=%s; user_id=%s; cardType=100", s.phpSessID, s.otaCuid))
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		return nil, 0, err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK:
+		// decoded below
+	case http.StatusUnauthorized:
+		// ScrapeAll looks for "401" in the error text to trigger a token reload.
+		return nil, 0, fmt.Errorf("401 Unauthorized")
+	default:
+		b, _ := io.ReadAll(resp.Body) // best effort: body only enriches the error
+		return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(b))
+	}
+
+	var payload struct {
+		Found int       `json:"found"`
+		Cards []apiCard `json:"cards"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
+		return nil, 0, fmt.Errorf("decoding search response: %w", err)
+	}
+	return payload.Cards, payload.Found, nil
+}
+
+// ---- conversion -------------------------------------------------------
+
+// convertCard turns one API card into a House document and mirrors its
+// images to S3.
+//
+// The document ID is "oikotie_" + the card ID, so a card without an ID is
+// rejected: otherwise every such card would overwrite the same document.
+// Images whose upload fails are logged and skipped, and Images is always a
+// non-nil slice so that it is stored as a JSON array rather than null.
+func (s *OikotieScraper) convertCard(c apiCard) (*House, error) {
+	if c.ID == "" {
+		return nil, fmt.Errorf("card has no cardId (url %q)", c.URL)
+	}
+
+	h := &House{
+		ID:        "oikotie_" + c.ID,
+		Source:    "oikotie",
+		URL:       "https://asunnot.oikotie.fi" + c.URL,
+		Status:    c.Status,
+		Type:      c.Type,
+		SubType:   c.SubType,
+		Images:    make([]string, 0, len(c.Medias)),
+		ScrapedAt: time.Now(),
+		Raw: map[string]json.RawMessage{
+			"data":     c.Data,
+			"location": c.Location,
+			"company":  c.Company,
+		},
+	}
+
+	// ---- images → download → S3 → store public URL --------------------
+	for i, m := range c.Medias {
+		if m.ImageMobileWebPx2 == "" {
+			continue
+		}
+		// The index of the media entry is part of the key, so keys stay
+		// stable even when earlier entries are skipped.
+		key := fmt.Sprintf("%s/img_%d.webp", h.ID, i)
+		publicURL, err := s.s3.UploadFromURL(m.ImageMobileWebPx2, key)
+		if err != nil {
+			log.Printf("image upload failed %s: %v", key, err)
+			continue
+		}
+		h.Images = append(h.Images, publicURL)
+	}
+	return h, nil
+}
+
+// ---------------------------------------------------------------------
+// 6. MAIN
+// ---------------------------------------------------------------------
+
+// main runs one full scrape and exits non-zero when it fails. An interrupt
+// signal (Ctrl-C) cancels the scrape's context so the loop can stop at its
+// next wait point.
+func main() {
+	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
+	err := run(ctx)
+	// Release the signal handler before a possible log.Fatal, which exits
+	// without running deferred calls.
+	stop()
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Println("All done!")
+}
+
+// run reads the configuration from the environment, wires up the S3 and
+// CouchDB clients and performs the scrape until it finishes or ctx is
+// cancelled.
+func run(ctx context.Context) error {
+	cfg := Config{
+		CouchURL:   getEnv("COUCHDB_URL", "https://couch.tammi.cc"),
+		CouchDB:    getEnv("COUCHDB_DATABASE", "asunnot"),
+		S3Endpoint: getEnv("S3_ENDPOINT", "s3.tammi.cc"),
+		S3Bucket:   getEnv("S3_BUCKET", "asunnot"),
+		S3UseSSL:   getEnvBool("S3_USE_SSL", true),
+	}
+
+	s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL)
+	if err != nil {
+		return fmt.Errorf("S3 init: %w", err)
+	}
+
+	couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB)
+	scraper := NewOikotieScraper(s3)
+
+	log.Println("Starting full Oikotie scrape …")
+	if err := scraper.ScrapeAll(ctx, couch); err != nil {
+		return fmt.Errorf("scrape failed: %w", err)
+	}
+	return nil
+}