RO/cmd/xml2txt/xml2txt.go: geen Engelstalige teksten

This commit is contained in:
Peter Kleiweg
2026-03-25 17:48:57 +01:00
parent f2d240f2cb
commit 65b36ceec4
3 changed files with 37 additions and 8 deletions

View File

@@ -3,7 +3,9 @@ package main
import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
"github.com/pebbe/textcat/v2"
"bytes"
"encoding/xml"
"fmt"
"os"
@@ -20,12 +22,16 @@ type Item struct {
// Package-level shorthands and precompiled patterns.
var (
// x is a short alias for e.ExitErr from codeberg.org/pebbe/errors. In this
// file it wraps calls whose error must not be ignored, e.g. x(err) or
// x(fmt.Fprintf(...)).
// NOTE(review): presumably terminates the program on a non-nil error — confirm
// against the errors package.
x = e.ExitErr
// w is a short alias for e.WarnErr from the same package. Callers here
// discard its return value and continue with the next file.
// NOTE(review): presumably reports the error without exiting — confirm.
w = e.WarnErr
// reYearWeek matches a whole string consisting of "2" plus three digits, a
// hyphen, and two digits of which the first is 0-5 (for example "2026-13").
// The pattern does not restrict the second part further, so values such as
// "00" or "59" also match.
// NOTE(review): the name suggests a year-week label; its use is not visible in
// this part of the file.
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
)
func main() {
tc := textcat.NewTextCat()
tc.EnableLanguages("en.utf8", "nl.utf8")
var ds string
switch len(os.Args) {
case 1:
@@ -52,14 +58,10 @@ func main() {
}
b, err := os.ReadFile(filename)
x(err)
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
x(err)
var buf bytes.Buffer
var item Item
x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
}
x(fp.WriteString(addEnd(fixSpace(item.Title))))
x(buf.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(err)
root := doc.Root()
@@ -71,8 +73,26 @@ func main() {
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
x(err)
for _, p := range pp {
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
x(buf.WriteString(addEnd(fixSpace(p.Content()))))
}
text := buf.String()
langs, err := tc.Classify(text)
if err != nil {
_ = w(fmt.Errorf("language: %v in %s", err, filename))
continue
}
if len(langs) != 1 || langs[0] != "nl.utf8" {
_ = w(fmt.Errorf("language: %v in %s", langs, filename))
continue
}
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
x(err)
for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
}
x(fp.WriteString(text))
x(fp.Close())
}
}