package main

/*
This program adjusts the "tag" metadata in compact corpus files (*.data.dz).

Tags removed:

	Oog:      Nieuws
	Parool:   Nieuws
	RO:       Artikelen, cafeyn
	RTVNoord: br_*
	Tzum:     Nieuws

Tags changed:

	RTVNoord: tr_* → *

The list above is grouped by corpus name, but the code does not look at which
corpus it is processing: every rule is applied to every input file.
*/

import (
	"encoding/xml"
	"fmt"
	"os"
	"strings"

	e "codeberg.org/pebbe/errors"
	cc "github.com/pebbe/compactcorpus"
	"github.com/rug-compling/alpinods"
)

var (
	// x is a short alias for e.ExitErr; it is called on every error value
	// that this program checks.
	x = e.ExitErr
)

// fixTag decides what happens to the value of a single "tag" metadata item.
//
// It returns the value the item should carry and whether the item is kept.
// The values "Nieuws", "Artikelen" and "cafeyn", and any value starting with
// "br_", are dropped. A value starting with "tr_" is kept with that prefix
// stripped. Every other value is kept unchanged.
func fixTag(value string) (string, bool) {
	switch {
	case value == "Nieuws",
		value == "Artikelen",
		value == "cafeyn",
		strings.HasPrefix(value, "br_"):
		return "", false
	case strings.HasPrefix(value, "tr_"):
		return strings.TrimPrefix(value, "tr_"), true
	default:
		return value, true
	}
}

// main processes every file named on the command line. For an input
// "NAME.data.dz" it writes a new corpus "NAME-new.data.dz" that holds the
// same documents with their "tag" metadata filtered through fixTag.
func main() {
	for _, path := range os.Args[1:] {
		stem := strings.TrimSuffix(path, ".data.dz")

		src, err := cc.Open(path)
		x(err)

		dst, err := cc.NewCorpus(stem + "-new.data.dz")
		x(err)

		it, err := src.NewRange()
		x(err)

		for it.HasNext() {
			name, data := it.Next()

			// Progress indicator; the trailing "\r" makes each document
			// overwrite the previous one on the same terminal line.
			fmt.Printf("%s %s \r", stem, name)

			var doc alpinods.AlpinoDS
			x(xml.Unmarshal(data, &doc))

			// Filter the metadata in place: items that survive are packed
			// towards the front of the same backing array, preserving order.
			items := doc.Metadata.Meta
			kept := items[:0]
			for _, item := range items {
				if item.Name == "tag" {
					value, keep := fixTag(item.Value)
					if !keep {
						continue
					}
					item.Value = value
				}
				kept = append(kept, item)
			}
			doc.Metadata.Meta = kept

			dst.Write(name, []byte(doc.String()))
		}

		x(dst.Close())
	}
}