diff options
| author | Petri Hienonen <petri.hienonen@gmail.com> | 2025-10-29 15:18:30 +0200 |
|---|---|---|
| committer | Petri Hienonen <petri.hienonen@gmail.com> | 2025-11-03 10:54:48 +0200 |
| commit | b03ee7032b2ea2d4d22ab7ec1346b7c9331cfc17 (patch) | |
| tree | efc0ce6823ab8611d9c6a0bf27ecdbd124638b73 /scrape | |
| download | housing-b03ee7032b2ea2d4d22ab7ec1346b7c9331cfc17.tar.zst | |
Initial commit
Diffstat (limited to 'scrape')
| -rw-r--r-- | scrape/go.mod | 22 | ||||
| -rw-r--r-- | scrape/go.sum | 33 | ||||
| -rw-r--r-- | scrape/main.go | 395 | ||||
| -rw-r--r-- | scrape/requirements.tsv | 15 |
4 files changed, 465 insertions, 0 deletions
diff --git a/scrape/go.mod b/scrape/go.mod new file mode 100644 index 0000000..c7a1086 --- /dev/null +++ b/scrape/go.mod @@ -0,0 +1,22 @@ +module tammi.cc/housing + +go 1.25.2 + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/go-ini/ini v1.67.0 // indirect + github.com/goccy/go-json v0.10.5 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.18.0 // indirect + github.com/klauspost/cpuid/v2 v2.2.11 // indirect + github.com/minio/crc64nvme v1.0.2 // indirect + github.com/minio/md5-simd v1.1.2 // indirect + github.com/minio/minio-go/v7 v7.0.95 // indirect + github.com/philhofer/fwd v1.2.0 // indirect + github.com/rs/xid v1.6.0 // indirect + github.com/tinylib/msgp v1.3.0 // indirect + golang.org/x/crypto v0.39.0 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/text v0.26.0 // indirect +) diff --git a/scrape/go.sum b/scrape/go.sum new file mode 100644 index 0000000..9fdb97e --- /dev/null +++ b/scrape/go.sum @@ -0,0 +1,33 @@ +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= +github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= +github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= +github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/klauspost/cpuid/v2 v2.0.1/go.mod 
h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/klauspost/cpuid/v2 v2.2.11 h1:0OwqZRYI2rFrjS4kvkDnqJkKHdHaRnCm68/DY4OxRzU= +github.com/klauspost/cpuid/v2 v2.2.11/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= +github.com/minio/crc64nvme v1.0.2 h1:6uO1UxGAD+kwqWWp7mBFsi5gAse66C4NXO8cmcVculg= +github.com/minio/crc64nvme v1.0.2/go.mod h1:eVfm2fAzLlxMdUGc0EEBGSMmPwmXD5XiNRpnu9J3bvg= +github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34= +github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM= +github.com/minio/minio-go/v7 v7.0.95 h1:ywOUPg+PebTMTzn9VDsoFJy32ZuARN9zhB+K3IYEvYU= +github.com/minio/minio-go/v7 v7.0.95/go.mod h1:wOOX3uxS334vImCNRVyIDdXX9OsXDm89ToynKgqUKlo= +github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM= +github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM= +github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU= +github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= +github.com/tinylib/msgp v1.3.0 h1:ULuf7GPooDaIlbyvgAxBV/FI7ynli6LZ1/nVUNu+0ww= +github.com/tinylib/msgp v1.3.0/go.mod h1:ykjzy2wzgrlvpDCRc4LA8UXy6D8bzMSuAF3WD57Gok0= +golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= +golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= +golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= diff --git a/scrape/main.go b/scrape/main.go new file mode 100644 index 0000000..7ef5ce4 --- /dev/null +++ b/scrape/main.go @@ 
-0,0 +1,395 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "net/url" + "os" + "strconv" + "strings" + "time" + + "github.com/minio/minio-go/v7" + "github.com/minio/minio-go/v7/pkg/credentials" +) + +// --------------------------------------------------------------------- +// 1. CONFIG & HELPERS +// --------------------------------------------------------------------- + +type Config struct { + CouchURL string + CouchDB string + S3Endpoint string + S3Bucket string + S3UseSSL bool +} + +func getEnv(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func getEnvBool(key string, def bool) bool { + if v := os.Getenv(key); v != "" { + b, _ := strconv.ParseBool(v) + return b + } + return def +} + +// --------------------------------------------------------------------- +// 2. S3 / MINIO CLIENT (public bucket – no keys) +// --------------------------------------------------------------------- + +type S3Client struct { + client *minio.Client + bucket string +} + +func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) { + // anonymous credentials → public bucket + c, err := minio.New(endpoint, &minio.Options{ + Creds: credentials.NewStaticV4("", "", ""), + Secure: useSSL, + }) + if err != nil { + return nil, err + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + exists, err := c.BucketExists(ctx, bucket) + if err != nil || !exists { + return nil, fmt.Errorf("bucket %s not accessible", bucket) + } + return &S3Client{client: c, bucket: bucket}, nil +} + +// UploadFromURL downloads a remote image, puts it in the bucket and returns the public URL +func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) { + resp, err := http.Get(imgURL) + if err != nil { + return "", err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("img status %d", 
resp.StatusCode) + } + data, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + _, err = s.client.PutObject(context.Background(), s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{ + ContentType: "image/webp", + }) + if err != nil { + return "", err + } + return fmt.Sprintf("https://%s/%s/%s", s.client.EndpointURL().Host, s.bucket, key), nil +} + +// --------------------------------------------------------------------- +// 3. HOUSE MODEL +// --------------------------------------------------------------------- + +type House struct { + ID string `json:"_id"` + Rev string `json:"_rev,omitempty"` + Source string `json:"source"` + URL string `json:"url"` + Status int `json:"status"` + Type int `json:"type"` + SubType int `json:"subType"` + Images []string `json:"images"` + Raw map[string]json.RawMessage `json:"raw,omitempty"` + ScrapedAt time.Time `json:"scraped_at"` +} + +// --------------------------------------------------------------------- +// 4. 
COUCHDB CLIENT (simplified – only Upsert) +// --------------------------------------------------------------------- + +type CouchClient struct { + baseURL string + database string + client *http.Client +} + +func NewCouchClient(base, db string) *CouchClient { + return &CouchClient{ + baseURL: base, + database: db, + client: &http.Client{Timeout: 30 * time.Second}, + } +} + +func (c *CouchClient) Upsert(h *House) error { + body, _ := json.Marshal(h) + reqURL := fmt.Sprintf("%s/%s/%s", c.baseURL, c.database, h.ID) + req, _ := http.NewRequest("PUT", reqURL, bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + + resp, err := c.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusCreated && resp.StatusCode != http.StatusOK { + b, _ := io.ReadAll(resp.Body) + return fmt.Errorf("couch %d: %s", resp.StatusCode, string(b)) + } + var rev struct { + Rev string `json:"rev"` + } + json.NewDecoder(resp.Body).Decode(&rev) + h.Rev = rev.Rev + return nil +} + +// --------------------------------------------------------------------- +// 5. 
OIKOTIE SCRAPER (single struct – everything in main.go) +// --------------------------------------------------------------------- + +type OikotieScraper struct { + client *http.Client + s3 *S3Client + baseURL string + otaToken string + otaCuid string + otaLoaded string + phpSessID string + rateLimiter <-chan time.Time +} + +func NewOikotieScraper(s3 *S3Client) *OikotieScraper { + return &OikotieScraper{ + client: &http.Client{Timeout: 30 * time.Second}, + s3: s3, + baseURL: "https://asunnot.oikotie.fi/api/search", + rateLimiter: time.Tick(3 * time.Second), + } +} + +// ---- token handling ------------------------------------------------- +func (osi *OikotieScraper) loadTokens() { + osi.otaToken = getEnv("OTA_TOKEN", "") + osi.otaCuid = getEnv("OTA_CUID", "") + osi.otaLoaded = getEnv("OTA_LOADED", "") + osi.phpSessID = getEnv("PHPSESSID", "") + + if osi.otaToken == "" || osi.otaCuid == "" || osi.otaLoaded == "" || osi.phpSessID == "" { + log.Println("Missing one or more tokens – please enter them now:") + r := bufio.NewReader(os.Stdin) + if osi.otaToken == "" { + fmt.Print("OTA-token: ") + osi.otaToken, _ = r.ReadString('\n') + osi.otaToken = strings.TrimSpace(osi.otaToken) + } + if osi.otaCuid == "" { + fmt.Print("OTA-cuid: ") + osi.otaCuid, _ = r.ReadString('\n') + osi.otaCuid = strings.TrimSpace(osi.otaCuid) + } + if osi.otaLoaded == "" { + fmt.Print("OTA-loaded: ") + osi.otaLoaded, _ = r.ReadString('\n') + osi.otaLoaded = strings.TrimSpace(osi.otaLoaded) + } + if osi.phpSessID == "" { + fmt.Print("PHPSESSID: ") + osi.phpSessID, _ = r.ReadString('\n') + osi.phpSessID = strings.TrimSpace(osi.phpSessID) + } + } +} + +// ---- main scrape loop ----------------------------------------------- +func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error { + os.loadTokens() + + limit := 24 + offset := 0 + totalSaved := 0 + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-os.rateLimiter: + cards, found, err := os.fetchPage(offset, 
limit) + if err != nil { + if strings.Contains(err.Error(), "401") { + log.Println("401 – re-entering tokens") + os.loadTokens() + continue + } + log.Printf("fetch error (offset %d): %v", offset, err) + time.Sleep(5 * time.Second) + continue + } + + if len(cards) == 0 { + log.Printf("No cards at offset %d – finished", offset) + break + } + + for _, c := range cards { + h, err := os.convertCard(c) + if err != nil { + log.Printf("convert error %s: %v", c.ID, err) + continue + } + if err := couch.Upsert(h); err != nil { + log.Printf("couch upsert %s: %v", h.ID, err) + } else { + totalSaved++ + } + } + + log.Printf("offset %d-%d → %d new (total %d/%d)", offset, offset+len(cards)-1, len(cards), totalSaved, found) + + if offset+len(cards) >= found { + log.Printf("Reached end – %d cards saved", totalSaved) + break + } + offset += limit + } + } + return nil +} + +// ---- API call -------------------------------------------------------- +type apiCard struct { + ID string `json:"cardId"` + Type int `json:"cardType"` + SubType int `json:"cardSubType"` + URL string `json:"url"` + Status int `json:"status"` + Data json.RawMessage + Location json.RawMessage + Company json.RawMessage + Medias []struct { + ImageMobileWebPx2 string `json:"imageMobileWebPx2"` + } `json:"medias"` +} + +func (os *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) { + q := url.Values{} + q.Add("locations", `[[64,6,"Helsinki"]]`) + for _, bt := range []string{"4", "8", "32", "128", "64", "512"} { + q.Add("buildingType[]", bt) + } + q.Add("cardType", "100") + q.Add("limit", strconv.Itoa(limit)) + q.Add("offset", strconv.Itoa(offset)) + q.Add("sortBy", "published_sort_desc") + + reqURL := os.baseURL + "?" 
+ q.Encode() + req, _ := http.NewRequest("GET", reqURL, nil) + + // ---- headers ---------------------------------------------------- + req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0") + req.Header.Set("Accept", "application/json, text/plain, */*") + req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512") + req.Header.Set("OTA-token", os.otaToken) + req.Header.Set("OTA-cuid", os.otaCuid) + req.Header.Set("OTA-loaded", os.otaLoaded) + req.Header.Set("Cookie", fmt.Sprintf("PHPSESSID=%s; user_id=%s; cardType=100", os.phpSessID, os.otaCuid)) + + resp, err := os.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + + if resp.StatusCode == 401 { + return nil, 0, fmt.Errorf("401 Unauthorized") + } + if resp.StatusCode != 200 { + b, _ := io.ReadAll(resp.Body) + return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(b)) + } + + var payload struct { + Found int `json:"found"` + Cards []apiCard `json:"cards"` + } + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + return nil, 0, err + } + return payload.Cards, payload.Found, nil +} + +// ---- conversion ------------------------------------------------------- +func (os *OikotieScraper) convertCard(c apiCard) (*House, error) { + h := &House{ + ID: "oikotie_" + c.ID, + Source: "oikotie", + URL: "https://asunnot.oikotie.fi" + c.URL, + Status: c.Status, + Type: c.Type, + SubType: c.SubType, + ScrapedAt: time.Now(), + Raw: map[string]json.RawMessage{ + "data": c.Data, + "location": c.Location, + "company": c.Company, + }, + } + + // ---- images → download → S3 → store public URL -------------------- + for i, m := range c.Medias { + if m.ImageMobileWebPx2 == "" { + continue + } + key := 
fmt.Sprintf("%s/img_%d.webp", h.ID, i) + publicURL, err := os.s3.UploadFromURL(m.ImageMobileWebPx2, key) + if err != nil { + log.Printf("image upload failed %s: %v", key, err) + continue + } + h.Images = append(h.Images, publicURL) + } + return h, nil +} + +// --------------------------------------------------------------------- +// 6. MAIN +// --------------------------------------------------------------------- + +func main() { + cfg := Config{ + CouchURL: getEnv("COUCHDB_URL", "https://couch.tammi.cc"), + CouchDB: getEnv("COUCHDB_DATABASE", "asunnot"), + S3Endpoint: getEnv("S3_ENDPOINT", "s3.tammi.cc"), + S3Bucket: getEnv("S3_BUCKET", "asunnot"), + S3UseSSL: getEnvBool("S3_USE_SSL", true), + } + + s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL) + if err != nil { + log.Fatal("S3 init:", err) + } + + couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB) + + scraper := NewOikotieScraper(s3) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + log.Println("Starting full Oikotie scrape …") + if err := scraper.ScrapeAll(ctx, couch); err != nil { + log.Fatal("scrape failed:", err) + } + log.Println("All done!") +} diff --git a/scrape/requirements.tsv b/scrape/requirements.tsv new file mode 100644 index 0000000..a9b5ecf --- /dev/null +++ b/scrape/requirements.tsv @@ -0,0 +1,15 @@ +ID Category Requirement +BACK-1 Scraping Download data from oikotie API +BACK-3 Scraping Rate limiting (3 seconds between requests) +BACK-4 Scraping No external dependencies, except minio +BACK-5 Image Processing Use webp images with high resolution +BACK-6 Image Processing Store images in MinIO S3 (s3.tammi.cc) +BACK-7 Storage CouchDB for house data +BACK-8 Storage Proper connection handling and error recovery +BACK-9 Data Management Detect new, updated, and removed houses +BACK-10 Data Management Track house timeline (appearance/disappearance) +BACK-11 Notifications ntfy.sh integration for new houses +BACK-12 Configuration Environment-based 
configuration +BACK-13 Operations Systemd service and timer for daily runs +BACK-15 Error Handling Comprehensive error logging and retries +BACK-19 Data Retention Keep data indefinitely |
