package main import ( "encoding/json" "fmt" "log" "os" "strings" "golang.org/x/net/html" ) type PropertyField struct { Title string `json:"title"` Value string `json:"value"` } type PropertySection struct { Section string `json:"section"` Fields []PropertyField `json:"fields"` } func test() { // Read the HTML file htmlContent, err := os.ReadFile("oikotie.html") if err != nil { log.Fatal("Error reading file:", err) } // Parse the HTML sections, err := parsePropertyHTML(string(htmlContent)) if err != nil { log.Fatal("Error parsing HTML:", err) } // Convert to JSON jsonData, err := json.MarshalIndent(sections, "", " ") if err != nil { log.Fatal("Error marshaling JSON:", err) } // Write to file err = os.WriteFile("property_data.json", jsonData, 0644) if err != nil { log.Fatal("Error writing JSON file:", err) } fmt.Println("Successfully parsed property data and saved to property_data.json") } func parsePropertyHTML(htmlContent string) ([]PropertySection, error) { doc, err := html.Parse(strings.NewReader(htmlContent)) if err != nil { return nil, err } var sections []PropertySection var currentSection *PropertySection // Recursive function to traverse the HTML nodes var traverse func(*html.Node) traverse = func(n *html.Node) { if n.Type == html.ElementNode { // Check for section headers if n.Data == "h3" && hasClass(n, "heading") && hasClass(n, "heading--title-2") { if currentSection != nil && len(currentSection.Fields) > 0 { sections = append(sections, *currentSection) } sectionName := extractText(n) currentSection = &PropertySection{ Section: sectionName, Fields: []PropertyField{}, } } // Check for info table rows if n.Data == "div" && hasClass(n, "info-table__row") { if currentSection != nil { field := parseInfoTableRow(n) if field.Title != "" { currentSection.Fields = append(currentSection.Fields, field) } } } } // Traverse child nodes for c := n.FirstChild; c != nil; c = c.NextSibling { traverse(c) } } traverse(doc) // Don't forget to add the last section if currentSection != nil && len(currentSection.Fields) > 0 { sections = append(sections, *currentSection) } return sections, nil } func parseInfoTableRow(n *html.Node) PropertyField { var field PropertyField var traverseRow func(*html.Node) traverseRow = func(node *html.Node) { if node.Type == html.ElementNode { if node.Data == "dt" && hasClass(node, "info-table__title") { field.Title = extractText(node) } if node.Data == "dd" && hasClass(node, "info-table__value") { field.Value = extractText(node) } } for c := node.FirstChild; c != nil; c = c.NextSibling { traverseRow(c) } } traverseRow(n) return field } func hasClass(n *html.Node, className string) bool { for _, attr := range n.Attr { if attr.Key == "class" && strings.Contains(attr.Val, className) { return true } } return false } func extractText(n *html.Node) string { var text strings.Builder var extract func(*html.Node) extract = func(node *html.Node) { if node.Type == html.TextNode { text.WriteString(node.Data) } for c := node.FirstChild; c != nil; c = c.NextSibling { extract(c) } } extract(n) return strings.TrimSpace(text.String()) }