aboutsummaryrefslogtreecommitdiffstats
path: root/scrape
diff options
context:
space:
mode:
Diffstat (limited to 'scrape')
-rw-r--r--scrape/go.mod22
-rw-r--r--scrape/go.sum33
-rw-r--r--scrape/main.go395
-rw-r--r--scrape/requirements.tsv15
4 files changed, 465 insertions, 0 deletions
diff --git a/scrape/go.mod b/scrape/go.mod
new file mode 100644
index 0000000..c7a1086
--- /dev/null
+++ b/scrape/go.mod
@@ -0,0 +1,22 @@
+module tammi.cc/housing
+
+go 1.25.2
+
+require (
+ github.com/dustin/go-humanize v1.0.1 // indirect
+ github.com/go-ini/ini v1.67.0 // indirect
+ github.com/goccy/go-json v0.10.5 // indirect
+ github.com/google/uuid v1.6.0 // indirect
+ github.com/klauspost/compress v1.18.0 // indirect
+ github.com/klauspost/cpuid/v2 v2.2.11 // indirect
+ github.com/minio/crc64nvme v1.0.2 // indirect
+ github.com/minio/md5-simd v1.1.2 // indirect
+ github.com/minio/minio-go/v7 v7.0.95
+ github.com/philhofer/fwd v1.2.0 // indirect
+ github.com/rs/xid v1.6.0 // indirect
+ github.com/tinylib/msgp v1.3.0 // indirect
+ golang.org/x/crypto v0.39.0 // indirect
+ golang.org/x/net v0.41.0 // indirect
+ golang.org/x/sys v0.33.0 // indirect
+ golang.org/x/text v0.26.0 // indirect
+)
diff --git a/scrape/go.sum b/scrape/go.sum
new file mode 100644
index 0000000..9fdb97e
--- /dev/null
+++ b/scrape/go.sum
@@ -0,0 +1,33 @@
+github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
+github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A=
+github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8=
+github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4=
+github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
+github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
+github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/klauspost/cpuid/v2 v2.2.11 h1:0OwqZRYI2rFrjS4kvkDnqJkKHdHaRnCm68/DY4OxRzU=
+github.com/klauspost/cpuid/v2 v2.2.11/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
+github.com/minio/crc64nvme v1.0.2 h1:6uO1UxGAD+kwqWWp7mBFsi5gAse66C4NXO8cmcVculg=
+github.com/minio/crc64nvme v1.0.2/go.mod h1:eVfm2fAzLlxMdUGc0EEBGSMmPwmXD5XiNRpnu9J3bvg=
+github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
+github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
+github.com/minio/minio-go/v7 v7.0.95 h1:ywOUPg+PebTMTzn9VDsoFJy32ZuARN9zhB+K3IYEvYU=
+github.com/minio/minio-go/v7 v7.0.95/go.mod h1:wOOX3uxS334vImCNRVyIDdXX9OsXDm89ToynKgqUKlo=
+github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM=
+github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
+github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU=
+github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
+github.com/tinylib/msgp v1.3.0 h1:ULuf7GPooDaIlbyvgAxBV/FI7ynli6LZ1/nVUNu+0ww=
+github.com/tinylib/msgp v1.3.0/go.mod h1:ykjzy2wzgrlvpDCRc4LA8UXy6D8bzMSuAF3WD57Gok0=
+golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM=
+golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U=
+golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw=
+golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA=
+golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw=
+golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M=
+golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA=
diff --git a/scrape/main.go b/scrape/main.go
new file mode 100644
index 0000000..7ef5ce4
--- /dev/null
+++ b/scrape/main.go
@@ -0,0 +1,395 @@
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/json"
+ "fmt"
+ "io"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/minio/minio-go/v7"
+ "github.com/minio/minio-go/v7/pkg/credentials"
+)
+
+// ---------------------------------------------------------------------
+// 1. CONFIG & HELPERS
+// ---------------------------------------------------------------------
+
+// Config collects the runtime settings that main reads from environment
+// variables.
+type Config struct {
+	// CouchURL is the CouchDB base URL; CouchDB is the database name.
+	CouchURL, CouchDB string
+	// S3Endpoint is the S3/MinIO endpoint; S3Bucket is the target bucket.
+	S3Endpoint, S3Bucket string
+	// S3UseSSL is passed to the S3 client as its "secure" option.
+	S3UseSSL bool
+}
+
+// getEnv returns the value of the environment variable key, or def when the
+// variable is unset or set to the empty string.
+func getEnv(key, def string) string {
+	value, found := os.LookupEnv(key)
+	if !found || value == "" {
+		return def
+	}
+	return value
+}
+
+// getEnvBool reads the environment variable key as a boolean.
+//
+// It returns def when the variable is unset or empty. It also returns def,
+// after logging a warning, when the value is not accepted by
+// strconv.ParseBool, so that a typo such as "yes" cannot silently flip a
+// default-true setting to false.
+func getEnvBool(key string, def bool) bool {
+	v := os.Getenv(key)
+	if v == "" {
+		return def
+	}
+	b, err := strconv.ParseBool(v)
+	if err != nil {
+		log.Printf("invalid boolean %q for %s – using default %t", v, key, def)
+		return def
+	}
+	return b
+}
+
+// ---------------------------------------------------------------------
+// 2. S3 / MINIO CLIENT (public bucket – no keys)
+// ---------------------------------------------------------------------
+
+// S3Client pairs a minio client with the single bucket it uploads to.
+type S3Client struct {
+ // client is the minio client created by NewS3Client.
+ client *minio.Client
+ // bucket is the bucket name used for every upload.
+ bucket string
+}
+
+// NewS3Client creates a minio client for endpoint and verifies that bucket
+// is reachable before returning an S3Client bound to it.
+//
+// Empty static credentials are supplied (anonymous access), so the bucket
+// must be publicly accessible. useSSL selects HTTPS for the connection. The
+// bucket check is limited to 5 seconds.
+//
+// An error is returned when the client cannot be constructed, when the
+// bucket check itself fails (the cause is wrapped), or when the bucket does
+// not exist.
+func NewS3Client(endpoint, bucket string, useSSL bool) (*S3Client, error) {
+	// anonymous credentials → public bucket
+	c, err := minio.New(endpoint, &minio.Options{
+		Creds:  credentials.NewStaticV4("", "", ""),
+		Secure: useSSL,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("creating S3 client for %s: %w", endpoint, err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	exists, err := c.BucketExists(ctx, bucket)
+	if err != nil {
+		// Keep the cause: a DNS/TLS/timeout failure needs different action
+		// than a missing bucket.
+		return nil, fmt.Errorf("bucket %s not accessible: %w", bucket, err)
+	}
+	if !exists {
+		return nil, fmt.Errorf("bucket %s not accessible: bucket does not exist", bucket)
+	}
+	return &S3Client{client: c, bucket: bucket}, nil
+}
+
+// UploadFromURL downloads the image at imgURL, stores it in the bucket under
+// key with content type "image/webp", and returns the URL of the stored
+// object.
+//
+// The returned URL is built from the client's endpoint scheme and host, the
+// bucket name and key, so it follows the http/https choice made when the
+// client was created. The download is limited to 30 seconds and the upload
+// to one minute.
+//
+// An error is returned when the download fails, the image host answers with
+// a status other than 200, or the upload fails.
+func (s *S3Client) UploadFromURL(imgURL, key string) (string, error) {
+	// http.Get would use http.DefaultClient, which has no timeout; a stalled
+	// image host would then block the whole scrape. The zero Transport still
+	// shares http.DefaultTransport, so connections are reused across calls.
+	httpClient := &http.Client{Timeout: 30 * time.Second}
+	resp, err := httpClient.Get(imgURL)
+	if err != nil {
+		return "", fmt.Errorf("downloading %s: %w", imgURL, err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return "", fmt.Errorf("img status %d", resp.StatusCode)
+	}
+	data, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("reading %s: %w", imgURL, err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	defer cancel()
+	_, err = s.client.PutObject(ctx, s.bucket, key, bytes.NewReader(data), int64(len(data)), minio.PutObjectOptions{
+		ContentType: "image/webp",
+	})
+	if err != nil {
+		return "", fmt.Errorf("uploading %s: %w", key, err)
+	}
+
+	// Use the endpoint's own scheme instead of assuming https, so the URL is
+	// also correct when the client was created with useSSL=false.
+	endpoint := s.client.EndpointURL()
+	return fmt.Sprintf("%s://%s/%s/%s", endpoint.Scheme, endpoint.Host, s.bucket, key), nil
+}
+
+// ---------------------------------------------------------------------
+// 3. HOUSE MODEL
+// ---------------------------------------------------------------------
+
+// House is the JSON document stored in CouchDB for one scraped listing.
+type House struct {
+ // ID is the CouchDB document id; convertCard sets it to "oikotie_" + card id.
+ ID string `json:"_id"`
+ // Rev is the CouchDB revision; Upsert fills it in from the server response.
+ Rev string `json:"_rev,omitempty"`
+ // Source names the site the listing came from ("oikotie").
+ Source string `json:"source"`
+ // URL is the listing page URL.
+ URL string `json:"url"`
+ // Status, Type and SubType are copied unchanged from the API card.
+ Status int `json:"status"`
+ Type int `json:"type"`
+ SubType int `json:"subType"`
+ // Images holds the URLs returned by S3Client.UploadFromURL for each image.
+ Images []string `json:"images"`
+ // Raw keeps the card's undecoded "data", "location" and "company" JSON.
+ Raw map[string]json.RawMessage `json:"raw,omitempty"`
+ // ScrapedAt is the time the card was converted.
+ ScrapedAt time.Time `json:"scraped_at"`
+}
+
+// ---------------------------------------------------------------------
+// 4. COUCHDB CLIENT (simplified – only Upsert)
+// ---------------------------------------------------------------------
+
+// CouchClient is a minimal CouchDB HTTP client bound to one database.
+type CouchClient struct {
+	// baseURL is the server URL; database is the database name.
+	baseURL, database string
+	// client performs the HTTP requests.
+	client *http.Client
+}
+
+// NewCouchClient returns a CouchClient for database db on the server at
+// base, using an HTTP client with a 30-second timeout.
+func NewCouchClient(base, db string) *CouchClient {
+	cc := new(CouchClient)
+	cc.baseURL = base
+	cc.database = db
+	cc.client = &http.Client{Timeout: 30 * time.Second}
+	return cc
+}
+
+// Upsert creates or replaces the document for h with PUT /{db}/{id}.
+//
+// CouchDB rejects an update that does not carry the document's current
+// revision (409 Conflict). When h.Rev is empty the current revision is
+// therefore looked up first, so that re-scraping an existing listing
+// overwrites it instead of failing. On success h.Rev is set to the revision
+// returned by the server.
+//
+// An error is returned when h cannot be encoded, a request cannot be built
+// or sent, the server answers with anything other than 200, 201 or 202, or
+// the response cannot be decoded.
+func (c *CouchClient) Upsert(h *House) error {
+	// Escape database and id so characters such as "/" cannot change the
+	// request path; tolerate a trailing slash on the base URL.
+	docURL := fmt.Sprintf("%s/%s/%s",
+		strings.TrimRight(c.baseURL, "/"), url.PathEscape(c.database), url.PathEscape(h.ID))
+
+	if h.Rev == "" {
+		rev, err := c.currentRev(docURL)
+		if err != nil {
+			return fmt.Errorf("looking up revision of %s: %w", h.ID, err)
+		}
+		h.Rev = rev
+	}
+
+	body, err := json.Marshal(h)
+	if err != nil {
+		return fmt.Errorf("encoding %s: %w", h.ID, err)
+	}
+	req, err := http.NewRequest(http.MethodPut, docURL, bytes.NewReader(body))
+	if err != nil {
+		return fmt.Errorf("building request for %s: %w", h.ID, err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK, http.StatusCreated, http.StatusAccepted:
+		// stored (202: accepted but not yet written by a quorum)
+	default:
+		// Cap the body so a large error page does not end up in the log.
+		b, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return fmt.Errorf("couch %d: %s", resp.StatusCode, string(b))
+	}
+
+	var result struct {
+		Rev string `json:"rev"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return fmt.Errorf("decoding response for %s: %w", h.ID, err)
+	}
+	h.Rev = result.Rev
+	return nil
+}
+
+// currentRev returns the current revision of the document at docURL, taken
+// from the ETag header of a HEAD request, or "" when the document does not
+// exist.
+func (c *CouchClient) currentRev(docURL string) (string, error) {
+	req, err := http.NewRequest(http.MethodHead, docURL, nil)
+	if err != nil {
+		return "", err
+	}
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK:
+		// The ETag is the revision in double quotes, possibly marked weak.
+		etag := strings.TrimPrefix(resp.Header.Get("ETag"), "W/")
+		return strings.Trim(etag, `"`), nil
+	case http.StatusNotFound:
+		return "", nil
+	default:
+		return "", fmt.Errorf("couch %d", resp.StatusCode)
+	}
+}
+
+// ---------------------------------------------------------------------
+// 5. OIKOTIE SCRAPER (single struct – everything in main.go)
+// ---------------------------------------------------------------------
+
+// OikotieScraper pages through the Oikotie search API and copies listing
+// images to S3.
+type OikotieScraper struct {
+	// client performs the API requests.
+	client *http.Client
+	// s3 receives the downloaded listing images.
+	s3 *S3Client
+	// baseURL is the search API endpoint.
+	baseURL string
+	// Session credentials sent as headers and cookies with each API request.
+	otaToken, otaCuid, otaLoaded, phpSessID string
+	// rateLimiter delivers a value each time another request may be sent.
+	rateLimiter <-chan time.Time
+}
+
+// NewOikotieScraper returns a scraper that uploads images through s3, talks
+// to the Oikotie search API with a 30-second HTTP timeout, and is paced by a
+// 3-second ticker.
+func NewOikotieScraper(s3 *S3Client) *OikotieScraper {
+	scraper := new(OikotieScraper)
+	scraper.client = &http.Client{Timeout: 30 * time.Second}
+	scraper.s3 = s3
+	scraper.baseURL = "https://asunnot.oikotie.fi/api/search"
+	scraper.rateLimiter = time.Tick(3 * time.Second)
+	return scraper
+}
+
+// ---- token handling -------------------------------------------------
+// loadTokens fills the four session credentials from the environment
+// variables OTA_TOKEN, OTA_CUID, OTA_LOADED and PHPSESSID.
+//
+// Every credential that is still empty afterwards is prompted for on
+// standard input, in that order. A read error (for example EOF when stdin is
+// not a terminal) is logged, and whatever was read before the error is used.
+func (osi *OikotieScraper) loadTokens() {
+	tokens := []struct {
+		envKey string  // environment variable consulted first
+		label  string  // name shown in the prompt
+		dst    *string // scraper field receiving the value
+	}{
+		{"OTA_TOKEN", "OTA-token", &osi.otaToken},
+		{"OTA_CUID", "OTA-cuid", &osi.otaCuid},
+		{"OTA_LOADED", "OTA-loaded", &osi.otaLoaded},
+		{"PHPSESSID", "PHPSESSID", &osi.phpSessID},
+	}
+
+	missing := false
+	for _, tok := range tokens {
+		*tok.dst = getEnv(tok.envKey, "")
+		if *tok.dst == "" {
+			missing = true
+		}
+	}
+	if !missing {
+		return
+	}
+
+	log.Println("Missing one or more tokens – please enter them now:")
+	stdin := bufio.NewReader(os.Stdin)
+	for _, tok := range tokens {
+		if *tok.dst != "" {
+			continue
+		}
+		fmt.Print(tok.label + ": ")
+		line, err := stdin.ReadString('\n')
+		if err != nil {
+			// Not fatal: ReadString still returns the data read so far.
+			log.Printf("reading %s from stdin: %v", tok.label, err)
+		}
+		*tok.dst = strings.TrimSpace(line)
+	}
+}
+
+// ---- main scrape loop -----------------------------------------------
+func (os *OikotieScraper) ScrapeAll(ctx context.Context, couch *CouchClient) error {
+ os.loadTokens()
+
+ limit := 24
+ offset := 0
+ totalSaved := 0
+
+ for {
+ select {
+ case <-ctx.Done():
+ return ctx.Err()
+ case <-os.rateLimiter:
+ cards, found, err := os.fetchPage(offset, limit)
+ if err != nil {
+ if strings.Contains(err.Error(), "401") {
+ log.Println("401 – re-entering tokens")
+ os.loadTokens()
+ continue
+ }
+ log.Printf("fetch error (offset %d): %v", offset, err)
+ time.Sleep(5 * time.Second)
+ continue
+ }
+
+ if len(cards) == 0 {
+ log.Printf("No cards at offset %d – finished", offset)
+ break
+ }
+
+ for _, c := range cards {
+ h, err := os.convertCard(c)
+ if err != nil {
+ log.Printf("convert error %s: %v", c.ID, err)
+ continue
+ }
+ if err := couch.Upsert(h); err != nil {
+ log.Printf("couch upsert %s: %v", h.ID, err)
+ } else {
+ totalSaved++
+ }
+ }
+
+ log.Printf("offset %d-%d → %d new (total %d/%d)", offset, offset+len(cards)-1, len(cards), totalSaved, found)
+
+ if offset+len(cards) >= found {
+ log.Printf("Reached end – %d cards saved", totalSaved)
+ break
+ }
+ offset += limit
+ }
+ }
+ return nil
+}
+
+// ---- API call --------------------------------------------------------
+// apiCard is one entry of the "cards" array in a search API response.
+type apiCard struct {
+ // ID is decoded from "cardId".
+ // NOTE(review): decoding fails if the API sends cardId as a JSON number
+ // rather than a string — confirm against a real response.
+ ID string `json:"cardId"`
+ Type int `json:"cardType"`
+ SubType int `json:"cardSubType"`
+ // URL is the listing URL as delivered by the API.
+ URL string `json:"url"`
+ Status int `json:"status"`
+ // Data, Location and Company have no tag, so encoding/json matches them
+ // to keys by field name, case-insensitively. They are kept undecoded.
+ Data json.RawMessage
+ Location json.RawMessage
+ Company json.RawMessage
+ // Medias lists the card's images; only the imageMobileWebPx2 URL is read.
+ Medias []struct {
+ ImageMobileWebPx2 string `json:"imageMobileWebPx2"`
+ } `json:"medias"`
+}
+
+// fetchPage requests one page of the Helsinki search results: limit cards
+// starting at offset, sorted by publication date descending.
+//
+// It returns the cards of that page and the total number of matches reported
+// by the API ("found").
+//
+// A 401 response yields the error "401 Unauthorized", which ScrapeAll
+// recognises by its text. Any other non-200 response yields an error with
+// the status code and at most the first 4 KiB of the response body.
+func (osi *OikotieScraper) fetchPage(offset, limit int) ([]apiCard, int, error) {
+	q := url.Values{}
+	q.Add("locations", `[[64,6,"Helsinki"]]`)
+	for _, bt := range []string{"4", "8", "32", "128", "64", "512"} {
+		q.Add("buildingType[]", bt)
+	}
+	q.Add("cardType", "100")
+	q.Add("limit", strconv.Itoa(limit))
+	q.Add("offset", strconv.Itoa(offset))
+	q.Add("sortBy", "published_sort_desc")
+
+	req, err := http.NewRequest(http.MethodGet, osi.baseURL+"?"+q.Encode(), nil)
+	if err != nil {
+		return nil, 0, fmt.Errorf("building search request: %w", err)
+	}
+
+	// ---- headers ----------------------------------------------------
+	// Browser-like headers plus the session credentials from loadTokens.
+	req.Header.Set("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:144.0) Gecko/20100101 Firefox/144.0")
+	req.Header.Set("Accept", "application/json, text/plain, */*")
+	req.Header.Set("Referer", "https://asunnot.oikotie.fi/myytavat-asunnot?pagination=1&locations=%5B%5B64,6,%22Helsinki%22%5D%5D&cardType=100&buildingType%5B%5D=4&buildingType%5B%5D=8&buildingType%5B%5D=32&buildingType%5B%5D=128&buildingType%5B%5D=64&buildingType%5B%5D=512")
+	req.Header.Set("OTA-token", osi.otaToken)
+	req.Header.Set("OTA-cuid", osi.otaCuid)
+	req.Header.Set("OTA-loaded", osi.otaLoaded)
+	req.Header.Set("Cookie", fmt.Sprintf("PHPSESSID=%s; user_id=%s; cardType=100", osi.phpSessID, osi.otaCuid))
+
+	resp, err := osi.client.Do(req)
+	if err != nil {
+		return nil, 0, err
+	}
+	defer resp.Body.Close()
+
+	switch resp.StatusCode {
+	case http.StatusOK:
+		// decoded below
+	case http.StatusUnauthorized:
+		return nil, 0, fmt.Errorf("401 Unauthorized")
+	default:
+		// Cap the body so an HTML error page does not flood the log.
+		b, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
+		return nil, 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(b))
+	}
+
+	var payload struct {
+		Found int       `json:"found"`
+		Cards []apiCard `json:"cards"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
+		return nil, 0, fmt.Errorf("decoding search response: %w", err)
+	}
+	return payload.Cards, payload.Found, nil
+}
+
+// ---- conversion -------------------------------------------------------
+// convertCard turns an API card into a House document and mirrors the
+// card's images to S3.
+//
+// The document id is "oikotie_" + c.ID. The listing URL is c.URL when it is
+// already absolute, otherwise c.URL appended to https://asunnot.oikotie.fi.
+// Each non-empty imageMobileWebPx2 URL is uploaded under
+// "<id>/img_<index>.webp"; a failed upload is logged and skipped. Images is
+// always a non-nil slice, so a listing without images is stored as [] rather
+// than null.
+//
+// An error is returned when the card has no id, because every such card
+// would otherwise be written to the same document "oikotie_".
+func (osi *OikotieScraper) convertCard(c apiCard) (*House, error) {
+	if c.ID == "" {
+		return nil, fmt.Errorf("card has no id")
+	}
+
+	// NOTE(review): SOURCE does not show whether the API sends relative or
+	// absolute URLs; accept both instead of prepending the host blindly.
+	listingURL := c.URL
+	if !strings.HasPrefix(listingURL, "http://") && !strings.HasPrefix(listingURL, "https://") {
+		listingURL = "https://asunnot.oikotie.fi" + listingURL
+	}
+
+	h := &House{
+		ID:        "oikotie_" + c.ID,
+		Source:    "oikotie",
+		URL:       listingURL,
+		Status:    c.Status,
+		Type:      c.Type,
+		SubType:   c.SubType,
+		Images:    make([]string, 0, len(c.Medias)),
+		ScrapedAt: time.Now(),
+		Raw: map[string]json.RawMessage{
+			"data":     c.Data,
+			"location": c.Location,
+			"company":  c.Company,
+		},
+	}
+
+	// ---- images → download → S3 → store public URL --------------------
+	for i, m := range c.Medias {
+		if m.ImageMobileWebPx2 == "" {
+			continue
+		}
+		key := fmt.Sprintf("%s/img_%d.webp", h.ID, i)
+		publicURL, err := osi.s3.UploadFromURL(m.ImageMobileWebPx2, key)
+		if err != nil {
+			log.Printf("image upload failed %s: %v", key, err)
+			continue
+		}
+		h.Images = append(h.Images, publicURL)
+	}
+	return h, nil
+}
+
+// ---------------------------------------------------------------------
+// 6. MAIN
+// ---------------------------------------------------------------------
+
+// main reads the configuration from the environment, connects to S3 and
+// CouchDB, and runs one full Oikotie scrape. It exits with a non-zero status
+// when the S3 bucket is not reachable or the scrape returns an error.
+func main() {
+	cfg := Config{
+		CouchURL:   getEnv("COUCHDB_URL", "https://couch.tammi.cc"),
+		CouchDB:    getEnv("COUCHDB_DATABASE", "asunnot"),
+		S3Endpoint: getEnv("S3_ENDPOINT", "s3.tammi.cc"),
+		S3Bucket:   getEnv("S3_BUCKET", "asunnot"),
+		S3UseSSL:   getEnvBool("S3_USE_SSL", true),
+	}
+
+	s3, err := NewS3Client(cfg.S3Endpoint, cfg.S3Bucket, cfg.S3UseSSL)
+	if err != nil {
+		// Fatalf, not Fatal: Fatal only inserts a space between operands
+		// when neither is a string, so the label ran into the error text.
+		log.Fatalf("S3 init: %v", err)
+	}
+
+	couch := NewCouchClient(cfg.CouchURL, cfg.CouchDB)
+
+	scraper := NewOikotieScraper(s3)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	log.Println("Starting full Oikotie scrape …")
+	if err := scraper.ScrapeAll(ctx, couch); err != nil {
+		log.Fatalf("scrape failed: %v", err)
+	}
+	log.Println("All done!")
+}
diff --git a/scrape/requirements.tsv b/scrape/requirements.tsv
new file mode 100644
index 0000000..a9b5ecf
--- /dev/null
+++ b/scrape/requirements.tsv
@@ -0,0 +1,15 @@
+ID Category Requirement
+BACK-1 Scraping Download data from oikotie API
+BACK-3 Scraping Rate limiting (3 seconds between requests)
+BACK-4 Scraping No external dependencies, except minio
+BACK-5 Image Processing Use webp images with high resolution
+BACK-6 Image Processing Store images in MinIO S3 (s3.tammi.cc)
+BACK-7 Storage CouchDB for house data
+BACK-8 Storage Proper connection handling and error recovery
+BACK-9 Data Management Detect new, updated, and removed houses
+BACK-10 Data Management Track house timeline (appearance/disappearance)
+BACK-11 Notifications ntfy.sh integration for new houses
+BACK-12 Configuration Environment-based configuration
+BACK-13 Operations Systemd service and timer for daily runs
+BACK-15 Error Handling Comprehensive error logging and retries
+BACK-19 Data Retention Keep data indefinitely