aboutsummaryrefslogtreecommitdiffstats
path: root/scrape/html.go
diff options
context:
space:
mode:
Diffstat (limited to 'scrape/html.go')
-rw-r--r--scrape/html.go151
1 files changed, 151 insertions, 0 deletions
diff --git a/scrape/html.go b/scrape/html.go
new file mode 100644
index 0000000..a1d1bbe
--- /dev/null
+++ b/scrape/html.go
@@ -0,0 +1,151 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "log"
+ "os"
+ "strings"
+
+ "golang.org/x/net/html"
+)
+
// PropertyField is a single title/value pair taken from one row of a
// property info table.
type PropertyField struct {
	// Title is the row label; serialized under the JSON key "title".
	Title string `json:"title"`
	// Value is the row content; serialized under the JSON key "value".
	Value string `json:"value"`
}
+
+type PropertySection struct {
+ Section string `json:"section"`
+ Fields []PropertyField `json:"fields"`
+}
+
+func test() {
+ // Read the HTML file
+ htmlContent, err := os.ReadFile("oikotie.html")
+ if err != nil {
+ log.Fatal("Error reading file:", err)
+ }
+
+ // Parse the HTML
+ sections, err := parsePropertyHTML(string(htmlContent))
+ if err != nil {
+ log.Fatal("Error parsing HTML:", err)
+ }
+
+ // Convert to JSON
+ jsonData, err := json.MarshalIndent(sections, "", " ")
+ if err != nil {
+ log.Fatal("Error marshaling JSON:", err)
+ }
+
+ // Write to file
+ err = os.WriteFile("property_data.json", jsonData, 0644)
+ if err != nil {
+ log.Fatal("Error writing JSON file:", err)
+ }
+
+ fmt.Println("Successfully parsed property data and saved to property_data.json")
+}
+
+func parsePropertyHTML(htmlContent string) ([]PropertySection, error) {
+ doc, err := html.Parse(strings.NewReader(htmlContent))
+ if err != nil {
+ return nil, err
+ }
+
+ var sections []PropertySection
+ var currentSection *PropertySection
+
+ // Recursive function to traverse the HTML nodes
+ var traverse func(*html.Node)
+ traverse = func(n *html.Node) {
+ if n.Type == html.ElementNode {
+ // Check for section headers
+ if n.Data == "h3" && hasClass(n, "heading") && hasClass(n, "heading--title-2") {
+ if currentSection != nil && len(currentSection.Fields) > 0 {
+ sections = append(sections, *currentSection)
+ }
+
+ sectionName := extractText(n)
+ currentSection = &PropertySection{
+ Section: sectionName,
+ Fields: []PropertyField{},
+ }
+ }
+
+ // Check for info table rows
+ if n.Data == "div" && hasClass(n, "info-table__row") {
+ if currentSection != nil {
+ field := parseInfoTableRow(n)
+ if field.Title != "" {
+ currentSection.Fields = append(currentSection.Fields, field)
+ }
+ }
+ }
+ }
+
+ // Traverse child nodes
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
+ traverse(c)
+ }
+ }
+
+ traverse(doc)
+
+ // Don't forget to add the last section
+ if currentSection != nil && len(currentSection.Fields) > 0 {
+ sections = append(sections, *currentSection)
+ }
+
+ return sections, nil
+}
+
+func parseInfoTableRow(n *html.Node) PropertyField {
+ var field PropertyField
+
+ var traverseRow func(*html.Node)
+ traverseRow = func(node *html.Node) {
+ if node.Type == html.ElementNode {
+ if node.Data == "dt" && hasClass(node, "info-table__title") {
+ field.Title = extractText(node)
+ }
+ if node.Data == "dd" && hasClass(node, "info-table__value") {
+ field.Value = extractText(node)
+ }
+ }
+
+ for c := node.FirstChild; c != nil; c = c.NextSibling {
+ traverseRow(c)
+ }
+ }
+
+ traverseRow(n)
+ return field
+}
+
+func hasClass(n *html.Node, className string) bool {
+ for _, attr := range n.Attr {
+ if attr.Key == "class" && strings.Contains(attr.Val, className) {
+ return true
+ }
+ }
+ return false
+}
+
+func extractText(n *html.Node) string {
+ var text strings.Builder
+
+ var extract func(*html.Node)
+ extract = func(node *html.Node) {
+ if node.Type == html.TextNode {
+ text.WriteString(node.Data)
+ }
+ for c := node.FirstChild; c != nil; c = c.NextSibling {
+ extract(c)
+ }
+ }
+
+ extract(n)
+ return strings.TrimSpace(text.String())
+}