Files
nlnieuws/Tzum/cmd/xml2txt/xml2txt.go
Peter Kleiweg 5c651387af grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc
2026-05-27 22:42:03 +02:00

79 lines
1.6 KiB
Go

package main
import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/xml"
"fmt"
"os"
"regexp"
"strings"
"time"
)
// Item holds the parts of one XML document that this program converts to
// text. The fields are filled by encoding/xml from the child elements named
// in the struct tags.
type Item struct {
	// Title is the character data of the <title> element.
	Title string `xml:"title"`

	// Text is the character data of the <encoded> element; main parses it
	// as an HTML fragment.
	Text string `xml:"encoded"`

	// Cats collects the character data of every <category> element, in
	// document order.
	Cats []string `xml:"category"`
}
var (
	// reYearWeek validates a week argument of the form yyyy.ww: a
	// four-digit year starting with 2, a literal dot, and two digits of
	// which the first is in the range 0-5.
	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)

	// x is a short alias for e.ExitErr, used throughout main to check
	// errors.
	// NOTE(review): presumably it terminates the program when handed a
	// non-nil error — confirm against codeberg.org/pebbe/errors.
	x = e.ExitErr
)
// main converts the Tzum XML items of one ISO week to plain-text files.
//
// Without arguments the week is the ISO week of the date seven days ago.
// With one argument the week is that argument, which must match reYearWeek
// (yyyy.ww). More than one argument is an error.
//
// The program changes into /net/corpora/nlnieuws/Tzum/<yyyy>/w<ww>, creates
// the subdirectory "out" if needed, and for every directory entry whose name
// ends in ".xml" writes out/<name>.txt containing:
//
//   - one "##META text tag = ..." line per category,
//   - the title,
//   - the content of each <p> that is a direct child of <body> in the item
//     text, skipping paragraphs that contain "verscheen eerst op Tzum.".
//
// Every error is handed to x (e.ExitErr).
// NOTE(review): x presumably terminates the program on a non-nil error —
// confirm; the code below relies on that after each failing call.
func main() {
	var ds string
	switch len(os.Args) {
	case 1:
		// No argument: use the ISO year and week of the date one week ago.
		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
		ds = fmt.Sprintf("%d.%02d", year, week)
	case 2:
		if !reYearWeek.MatchString(os.Args[1]) {
			x(fmt.Errorf("arg must be yyyy.ww"))
		}
		ds = os.Args[1]
	default:
		x(fmt.Errorf("too many arguments"))
	}

	// "yyyy.ww" -> "yyyy/www": the dot at index 4 is dropped and the week
	// digits get a "w" prefix.
	dp := ds[:4] + "/w" + ds[5:]
	x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
	x(os.MkdirAll("out", 0777))

	files, err := os.ReadDir(".")
	x(err)
	for _, file := range files {
		filename := file.Name()
		if !strings.HasSuffix(filename, ".xml") {
			continue
		}

		b, err := os.ReadFile(filename)
		x(err)
		fp, err := os.Create("out/" + strings.TrimSuffix(filename, ".xml") + ".txt")
		x(err)

		var item Item
		x(xml.Unmarshal(b, &item))

		for _, cat := range item.Cats {
			x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
		}
		// NOTE(review): u.FixSpace and u.AddEnd are project helpers not
		// visible here; presumably they normalize whitespace and terminate
		// the line — confirm in internal/util.
		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))

		// Wrap the item text so the parser sees a complete HTML document.
		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
		x(err)
		root := doc.Root()
		pp, err := root.Search(`//body/p`)
		x(err)
		for _, p := range pp {
			// Fetch the paragraph text once and reuse it for both the
			// filter and the output.
			s := p.Content()
			if !strings.Contains(s, "verscheen eerst op Tzum.") {
				x(fp.WriteString(u.AddEnd(u.FixSpace(s))))
			}
		}
		// gokogiri documents are backed by libxml2 memory that Go's garbage
		// collector does not manage. Release it explicitly at the end of
		// each iteration (a defer would only run when main returns).
		doc.Free()

		x(fp.Close())
	}
}