diff options
| author | Petri Hienonen <petri.hienonen@gmail.com> | 2025-11-14 11:47:49 +0200 |
|---|---|---|
| committer | Petri Hienonen <petri.hienonen@gmail.com> | 2025-11-14 11:47:49 +0200 |
| commit | d41ac3c094f733a8038885de3400ed7558b2b878 (patch) | |
| tree | a9a7cd54900e0b0c66f3293f4ff6bc6ad5cbbec6 /scrape/html.go | |
| parent | 6ca89c37f84c6b1d63c869e6471d3570d51f63be (diff) | |
| download | housing-d41ac3c094f733a8038885de3400ed7558b2b878.tar.zst | |
Minor tuning
Diffstat (limited to 'scrape/html.go')
| -rw-r--r-- | scrape/html.go | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/scrape/html.go b/scrape/html.go new file mode 100644 index 0000000..a1d1bbe --- /dev/null +++ b/scrape/html.go @@ -0,0 +1,151 @@ +package main + +import ( + "encoding/json" + "fmt" + "log" + "os" + "strings" + + "golang.org/x/net/html" +) + +type PropertyField struct { + Title string `json:"title"` + Value string `json:"value"` +} + +type PropertySection struct { + Section string `json:"section"` + Fields []PropertyField `json:"fields"` +} + +func test() { + // Read the HTML file + htmlContent, err := os.ReadFile("oikotie.html") + if err != nil { + log.Fatal("Error reading file:", err) + } + + // Parse the HTML + sections, err := parsePropertyHTML(string(htmlContent)) + if err != nil { + log.Fatal("Error parsing HTML:", err) + } + + // Convert to JSON + jsonData, err := json.MarshalIndent(sections, "", " ") + if err != nil { + log.Fatal("Error marshaling JSON:", err) + } + + // Write to file + err = os.WriteFile("property_data.json", jsonData, 0644) + if err != nil { + log.Fatal("Error writing JSON file:", err) + } + + fmt.Println("Successfully parsed property data and saved to property_data.json") +} + +func parsePropertyHTML(htmlContent string) ([]PropertySection, error) { + doc, err := html.Parse(strings.NewReader(htmlContent)) + if err != nil { + return nil, err + } + + var sections []PropertySection + var currentSection *PropertySection + + // Recursive function to traverse the HTML nodes + var traverse func(*html.Node) + traverse = func(n *html.Node) { + if n.Type == html.ElementNode { + // Check for section headers + if n.Data == "h3" && hasClass(n, "heading") && hasClass(n, "heading--title-2") { + if currentSection != nil && len(currentSection.Fields) > 0 { + sections = append(sections, *currentSection) + } + + sectionName := extractText(n) + currentSection = &PropertySection{ + Section: sectionName, + Fields: []PropertyField{}, + } + } + + // Check for info table rows + if n.Data == "div" && hasClass(n, "info-table__row") { + if currentSection != nil { + field := parseInfoTableRow(n) + if field.Title != "" { + currentSection.Fields = append(currentSection.Fields, field) + } + } + } + } + + // Traverse child nodes + for c := n.FirstChild; c != nil; c = c.NextSibling { + traverse(c) + } + } + + traverse(doc) + + // Don't forget to add the last section + if currentSection != nil && len(currentSection.Fields) > 0 { + sections = append(sections, *currentSection) + } + + return sections, nil +} + +func parseInfoTableRow(n *html.Node) PropertyField { + var field PropertyField + + var traverseRow func(*html.Node) + traverseRow = func(node *html.Node) { + if node.Type == html.ElementNode { + if node.Data == "dt" && hasClass(node, "info-table__title") { + field.Title = extractText(node) + } + if node.Data == "dd" && hasClass(node, "info-table__value") { + field.Value = extractText(node) + } + } + + for c := node.FirstChild; c != nil; c = c.NextSibling { + traverseRow(c) + } + } + + traverseRow(n) + return field +} + +func hasClass(n *html.Node, className string) bool { + for _, attr := range n.Attr { + if attr.Key == "class" && strings.Contains(attr.Val, className) { + return true + } + } + return false +} + +func extractText(n *html.Node) string { + var text strings.Builder + + var extract func(*html.Node) + extract = func(node *html.Node) { + if node.Type == html.TextNode { + text.WriteString(node.Data) + } + for c := node.FirstChild; c != nil; c = c.NextSibling { + extract(c) + } + } + + extract(n) + return strings.TrimSpace(text.String()) +} |
