diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index 5d0f1fd..ad7f1a3 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -3,7 +3,9 @@ package main import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" + "github.com/pebbe/textcat/v2" + "bytes" "encoding/xml" "fmt" "os" @@ -20,12 +22,16 @@ type Item struct { var ( x = e.ExitErr + w = e.WarnErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) func main() { + tc := textcat.NewTextCat() + tc.EnableLanguages("en.utf8", "nl.utf8") + var ds string switch len(os.Args) { case 1: @@ -52,14 +58,10 @@ func main() { } b, err := os.ReadFile(filename) x(err) - fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") - x(err) + var buf bytes.Buffer var item Item x(xml.Unmarshal(b, &item)) - for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) - } - x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(buf.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() @@ -71,8 +73,26 @@ func main() { pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) x(err) for _, p := range pp { - x(fp.WriteString(addEnd(fixSpace(p.Content())))) + x(buf.WriteString(addEnd(fixSpace(p.Content())))) } + + text := buf.String() + langs, err := tc.Classify(text) + if err != nil { + _ = w(fmt.Errorf("language: %v in %s", err, filename)) + continue + } + if len(langs) != 1 || langs[0] != "nl.utf8" { + _ = w(fmt.Errorf("language: %v in %s", langs, filename)) + continue + } + + fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") + x(err) + for _, cat := range item.Cats { + x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) + } + x(fp.WriteString(text)) x(fp.Close()) } } diff --git a/go.mod b/go.mod index 9c51388..e583a0e 100644 --- a/go.mod +++ b/go.mod @@ -4,4 +4,9 @@ go 1.26.1 require github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 -require codeberg.org/pebbe/errors v0.4.0 +require ( + codeberg.org/pebbe/errors v0.4.0 + github.com/pebbe/textcat/v2 v2.3.0 +) + +require github.com/pebbe/util v0.9.0 // indirect diff --git a/go.sum b/go.sum index 917be06..899fa8d 100644 --- a/go.sum +++ b/go.sum @@ -2,3 +2,7 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU= codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw= +github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y= +github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg= +github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA= +github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=