Files
nlnieuws/oud/fix.go
2026-05-29 12:22:57 +02:00

67 lines
1.3 KiB
Go

package main
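/*
This program modifies corpora by cleaning up their "tag" metadata.

The rules below are listed per source, but the program applies all of
them to every input file.

Tags removed:
Oog: Nieuws
Parool: Nieuws
RO: Artikelen, cafeyn
RTVNoord: br_*
Tzum: Nieuws

Tags changed:
RTVNoord: tr_* → * (the "tr_" prefix is stripped)
*/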
import (
e "codeberg.org/pebbe/errors"
cc "github.com/pebbe/compactcorpus"
"github.com/rug-compling/alpinods"
"encoding/xml"
"fmt"
"os"
"strings"
)
// x is a short alias for e.ExitErr; main passes every error value it
// receives to it.
// NOTE(review): presumably ExitErr reports a non-nil error and terminates
// the program — confirm against the codeberg.org/pebbe/errors documentation.
var x = e.ExitErr
// main rewrites the "tag" metadata of every corpus named on the command line.
//
// For each argument FILE, the suffix ".data.dz" is stripped (if present) to
// obtain BASE, and a new corpus BASE-new.data.dz is created. Every document
// of FILE is decoded, its "tag" metadata entries are cleaned up, and the
// re-serialised document is written to the new corpus under the same name.
//
// Tag rules, applied only to metadata entries whose name is "tag":
//   - value "Nieuws", "Artikelen", "cafeyn", or any value starting with
//     "br_": the entry is removed;
//   - value starting with "tr_": the prefix is stripped, the entry is kept.
//
// Every error is handed to x.
func main() {
	for _, path := range os.Args[1:] {
		stem := strings.TrimSuffix(path, ".data.dz")

		// NOTE(review): src is never closed — confirm whether compactcorpus
		// needs an explicit Close for a corpus opened for reading.
		src, err := cc.Open(path)
		x(err)
		dst, err := cc.NewCorpus(stem + "-new.data.dz")
		x(err)
		docs, err := src.NewRange()
		x(err)

		for docs.HasNext() {
			name, data := docs.Next()

			// Progress indicator: the trailing "\r" lets the next line
			// overwrite this one on a terminal.
			fmt.Printf("%s %s \r", stem, name)

			var doc alpinods.AlpinoDS
			x(xml.Unmarshal(data, &doc))

			// Filter the metadata in place: kept shares the backing array
			// and never gets ahead of the element currently being read.
			kept := doc.Metadata.Meta[:0]
			for _, m := range doc.Metadata.Meta {
				if m.Name == "tag" {
					switch v := m.Value; {
					case v == "Nieuws", v == "Artikelen", v == "cafeyn", strings.HasPrefix(v, "br_"):
						// Unwanted tag: drop the whole entry.
						continue
					case strings.HasPrefix(v, "tr_"):
						// Keep the tag, without its "tr_" prefix.
						m.Value = v[3:]
					}
				}
				kept = append(kept, m)
			}
			doc.Metadata.Meta = kept

			// NOTE(review): whatever Write returns is discarded here —
			// confirm whether it reports an error that should go through x.
			dst.Write(name, []byte(doc.String()))
		}

		x(dst.Close())
	}
}