67 lines
1.3 KiB
Go
67 lines
1.3 KiB
Go
package main
|
|
|
|
/*
This program modifies corpora.

Tags removed:

	Oog: Nieuws
	Parool: Nieuws
	RO: Artikelen, cafeyn
	RTVNoord: br_*
	Tzum: Nieuws

Tags changed:

	RTVNoord: tr_* → *
*/
|
|
|
|
import (
|
|
e "codeberg.org/pebbe/errors"
|
|
cc "github.com/pebbe/compactcorpus"
|
|
"github.com/rug-compling/alpinods"
|
|
|
|
"encoding/xml"
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
)
|
|
|
|
var (
|
|
x = e.ExitErr
|
|
)
|
|
|
|
func main() {
|
|
for _, file := range os.Args[1:] {
|
|
base := strings.TrimSuffix(file, ".data.dz")
|
|
newfile := base + "-new.data.dz"
|
|
|
|
incc, err := cc.Open(file)
|
|
x(err)
|
|
outcc, err := cc.NewCorpus(newfile)
|
|
x(err)
|
|
r, err := incc.NewRange()
|
|
x(err)
|
|
for r.HasNext() {
|
|
name, data := r.Next()
|
|
fmt.Printf("%s %s \r", base, name)
|
|
var alpino alpinods.AlpinoDS
|
|
x(xml.Unmarshal(data, &alpino))
|
|
for i := 0; i < len(alpino.Metadata.Meta); i++ {
|
|
if alpino.Metadata.Meta[i].Name != "tag" {
|
|
continue
|
|
}
|
|
if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") {
|
|
alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...)
|
|
i--
|
|
} else if strings.HasPrefix(n, "tr_") {
|
|
alpino.Metadata.Meta[i].Value = n[3:]
|
|
}
|
|
}
|
|
outcc.Write(name, []byte(alpino.String()))
|
|
}
|
|
x(outcc.Close())
|
|
}
|
|
}
|