RO/cmd/xml2txt/xml2txt.go: geen Engelstalige teksten
This commit is contained in:
@@ -3,7 +3,9 @@ package main
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
"github.com/jbowtie/gokogiri"
|
||||
"github.com/pebbe/textcat/v2"
|
||||
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os"
|
||||
@@ -20,12 +22,16 @@ type Item struct {
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
w = e.WarnErr
|
||||
|
||||
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
tc := textcat.NewTextCat()
|
||||
tc.EnableLanguages("en.utf8", "nl.utf8")
|
||||
|
||||
var ds string
|
||||
switch len(os.Args) {
|
||||
case 1:
|
||||
@@ -52,14 +58,10 @@ func main() {
|
||||
}
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
var buf bytes.Buffer
|
||||
var item Item
|
||||
x(xml.Unmarshal(b, &item))
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
|
||||
}
|
||||
x(fp.WriteString(addEnd(fixSpace(item.Title))))
|
||||
x(buf.WriteString(addEnd(fixSpace(item.Title))))
|
||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||
x(err)
|
||||
root := doc.Root()
|
||||
@@ -71,8 +73,26 @@ func main() {
|
||||
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
|
||||
x(err)
|
||||
for _, p := range pp {
|
||||
x(fp.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
x(buf.WriteString(addEnd(fixSpace(p.Content()))))
|
||||
}
|
||||
|
||||
text := buf.String()
|
||||
langs, err := tc.Classify(text)
|
||||
if err != nil {
|
||||
_ = w(fmt.Errorf("language: %v in %s", err, filename))
|
||||
continue
|
||||
}
|
||||
if len(langs) != 1 || langs[0] != "nl.utf8" {
|
||||
_ = w(fmt.Errorf("language: %v in %s", langs, filename))
|
||||
continue
|
||||
}
|
||||
|
||||
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
|
||||
x(err)
|
||||
for _, cat := range item.Cats {
|
||||
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
|
||||
}
|
||||
x(fp.WriteString(text))
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
7
go.mod
7
go.mod
@@ -4,4 +4,9 @@ go 1.26.1
|
||||
|
||||
require github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
|
||||
|
||||
require codeberg.org/pebbe/errors v0.4.0
|
||||
require (
|
||||
codeberg.org/pebbe/errors v0.4.0
|
||||
github.com/pebbe/textcat/v2 v2.3.0
|
||||
)
|
||||
|
||||
require github.com/pebbe/util v0.9.0 // indirect
|
||||
|
||||
4
go.sum
4
go.sum
@@ -2,3 +2,7 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
|
||||
codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
|
||||
github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
|
||||
github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
|
||||
github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
|
||||
github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
|
||||
github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
|
||||
|
||||
Reference in New Issue
Block a user