diff --git a/Oog/cmd/xml2txt/xml2txt.go b/Oog/cmd/xml2txt/xml2txt.go index d84e686..f7881ad 100644 --- a/Oog/cmd/xml2txt/xml2txt.go +++ b/Oog/cmd/xml2txt/xml2txt.go @@ -59,7 +59,10 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) + t := u.FixSpace(cat) + if t != "Nieuws" { + x(fmt.Fprintf(fp, "##META text tag = %s\n", t)) + } } x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go index 4f5aa9a..39475b1 100644 --- a/Parool/cmd/parool/parool.go +++ b/Parool/cmd/parool/parool.go @@ -286,7 +286,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } for _, el := range ell { s := strings.TrimSpace(el.Content()) - if s != "" { + if s != "" && s != "Nieuws" { tags = append(tags, s) } } diff --git a/README.md b/README.md index a91796d..1db09c5 100644 --- a/README.md +++ b/README.md @@ -43,21 +43,22 @@ crontab van p209327@colossus Uitvoer in `[A-Z]*/corpus/` -NieuwsNL elke dag, de rest alleen op dinsdag - crontab van p209327@colossus ``` # m h dom mon dow command +# veel data: elke dag +0 1 * * * /net/corpora/nlnieuws/HLN/txt2corpus.sh +0 1 * * * /net/corpora/nlnieuws/NOS/txt2corpus.sh +0 1 * * * /net/corpora/nlnieuws/NU/txt2corpus.sh +0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh +0 1 * * * /net/corpora/nlnieuws/VRT/txt2corpus.sh +# weinig data: alleen op dinsdag 0 1 * * 2 /net/corpora/nlnieuws/AT5/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/GG/txt2corpus.sh -0 1 * * 2 /net/corpora/nlnieuws/HLN/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/LitNL/txt2corpus.sh -0 1 * * * /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh -0 1 * * 2 /net/corpora/nlnieuws/NOS/txt2corpus.sh -0 1 * * 2 /net/corpora/nlnieuws/NU/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/Oog/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/Parool/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/RO/txt2corpus.sh @@ -65,7 +66,6 @@ crontab van p209327@colossus 0 1 * * 2 /net/corpora/nlnieuws/Sargasso/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/Sikkom/txt2corpus.sh 0 1 * * 2 /net/corpora/nlnieuws/Tzum/txt2corpus.sh -0 1 * * 2 /net/corpora/nlnieuws/VRT/txt2corpus.sh ``` ## 3. Queries uitvoeren, tellingen doen diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index 7997986..ce93742 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -92,7 +92,10 @@ func main() { fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") x(err) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) + t := u.FixSpace(cat) + if t != "Artikelen" && t != "cafeyn" { + x(fmt.Fprintf(fp, "##META text tag = %s\n", t)) + } } x(fp.WriteString(text)) x(fp.Close()) diff --git a/RTVNoord/cmd/rtvnoord/rtvnoord.go b/RTVNoord/cmd/rtvnoord/rtvnoord.go index 1d8935c..c20b67b 100644 --- a/RTVNoord/cmd/rtvnoord/rtvnoord.go +++ b/RTVNoord/cmd/rtvnoord/rtvnoord.go @@ -236,7 +236,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool p(fmt.Fprintln(fp, "##META text tag =")) } else { for _, tag := range doc.Tags { - p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(u.FixSpace(tag)))) + t := strings.ToLower(u.FixSpace(tag)) + if strings.HasPrefix(t, "br_") { + continue + } + if strings.HasPrefix(t, "tr_") { + t = t[3:] + } + p(fmt.Fprintf(fp, "##META text tag = %s\n", t)) } } if doc.Cat == "" { diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 0acc04f..188ea4d 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -59,7 +59,11 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) + t := u.FixSpace(cat) + if t == "Nieuws" { + continue + } + x(fmt.Fprintf(fp, "##META text tag = %s\n", t)) } x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) diff --git a/cmd/data2json/data2json.go b/cmd/data2json/data2json.go index 10772bd..8dc6c76 100644 --- a/cmd/data2json/data2json.go +++ b/cmd/data2json/data2json.go @@ -7,6 +7,7 @@ import ( "encoding/json" "fmt" "os" + "regexp" "strconv" "strings" "time" @@ -49,13 +50,14 @@ var ( parts = map[string]struct { file string suffix string + re *regexp.Regexp }{ - "nieuwe namen": {"nieuwe-namen", ".t20"}, - "nieuwe woorden": {"nieuwe-woorden-extra", ".t20"}, - "personen": {"personen", ""}, - "andere namen": {"overige-namen", ""}, - "locaties": {"locaties", ""}, - "organisaties": {"organisaties", ""}, + "nieuwe namen": {"nieuwe-namen", ".t20", nil}, + "nieuwe woorden": {"nieuwe-woorden-extra", ".t20", nil}, + "personen": {"personen", "", nil}, + "andere namen": {"overige-namen", "", nil}, + "locaties": {"locaties", "", nil}, + "organisaties": {"organisaties", "", regexp.MustCompile(`^(ANP|AT5)`)}, } maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december") @@ -142,12 +144,15 @@ func makeValues(source, part string) [][5]any { scanner := bufio.NewScanner(fp) lineno := 0 for scanner.Scan() { - lineno++ line := scanner.Text() aa := strings.Split(line, "\t") count, err := strconv.Atoi(strings.TrimSpace(aa[0])) x(err) word := aa[1] + if parts[part].re != nil && parts[part].re.MatchString(word) { + continue + } + lineno++ var tags, lemma, postag string if len(aa) > 2 { tags = aa[2] diff --git a/go.mod b/go.mod index cd67917..eadde75 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,9 @@ go 1.26.1 require ( codeberg.org/pebbe/errors v0.4.0 github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 + github.com/pebbe/compactcorpus v1.0.3 github.com/pebbe/textcat/v2 v2.3.0 + github.com/rug-compling/alpinods v1.18.1 ) require github.com/pebbe/util v0.9.0 // indirect diff --git a/go.sum b/go.sum index 899fa8d..a45ccdc 100644 --- a/go.sum +++ b/go.sum @@ -2,7 +2,11 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU= codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw= +github.com/pebbe/compactcorpus v1.0.3 h1:6qlfXKHTKg7oWKLPCgEgv1scplfvphg/9l9XiRT2HzQ= +github.com/pebbe/compactcorpus v1.0.3/go.mod h1:SSpTeCZataCjjs82RJb8SOGdjkB3PlR7Z19EY4rInoQ= github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y= github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg= github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA= github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q= +github.com/rug-compling/alpinods v1.18.1 h1:BvPcCnNEQ1QoVSc0RmwJd3kZmvo4iqZ52/vFzVvFS7w= +github.com/rug-compling/alpinods v1.18.1/go.mod h1:R3BBX8RIw9InVqHZ+1W+MsX8WX8uBkoVNNGE38mqF1Q= diff --git a/internal/util/util.go b/internal/util/util.go index 98623b2..b080949 100644 --- a/internal/util/util.go +++ b/internal/util/util.go @@ -11,8 +11,10 @@ import ( ) var ( - p = e.PanicErr - reEOL = regexp.MustCompile(`[.!?]['"”’]?$`) + p = e.PanicErr + reEOL = regexp.MustCompile(`[.!?]['"”’]?$`) + reNEOL = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}`) + reLET = regexp.MustCompile(`\p{Lu}`) ) func AddEnd(s string) string { @@ -27,7 +29,12 @@ func AddEnd(s string) string { } func FixSpace(s string) string { - return strings.Join(strings.Fields(s), " ") + s = strings.Join(strings.Fields(s), " ") + s = reNEOL.ReplaceAllStringFunc(s, func(s1 string) string { + i := reLET.FindStringIndex(s1)[0] + return s1[:i] + " " + s1[i:] + }) + return s } func MkLock(filename string) { diff --git a/oud/fix.go b/oud/fix.go new file mode 100644 index 0000000..3989a5b --- /dev/null +++ b/oud/fix.go @@ -0,0 +1,66 @@ +package main + +/* +Dit past corpora aan + +Tags verwijderen: + +Oog: Nieuws +Parool: Nieuws +RO: Artikelen, cafeyn +RTVNoord: br_* +Tzum: Nieuws + +Tags veranderen: + +RTVNoord: tr_* → * + +*/ + +import ( + e "codeberg.org/pebbe/errors" + cc "github.com/pebbe/compactcorpus" + "github.com/rug-compling/alpinods" + + "encoding/xml" + "fmt" + "os" + "strings" +) + +var ( + x = e.ExitErr +) + +func main() { + for _, file := range os.Args[1:] { + base := strings.TrimSuffix(file, ".data.dz") + newfile := base + "-new.data.dz" + + incc, err := cc.Open(file) + x(err) + outcc, err := cc.NewCorpus(newfile) + x(err) + r, err := incc.NewRange() + x(err) + for r.HasNext() { + name, data := r.Next() + fmt.Printf("%s %s \r", base, name) + var alpino alpinods.AlpinoDS + x(xml.Unmarshal(data, &alpino)) + for i := 0; i < len(alpino.Metadata.Meta); i++ { + if alpino.Metadata.Meta[i].Name != "tag" { + continue + } + if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") { + alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...) + i-- + } else if strings.HasPrefix(n, "tr_") { + alpino.Metadata.Meta[i].Value = n[3:] + } + } + outcc.Write(name, []byte(alpino.String())) + } + x(outcc.Close()) + } +} diff --git a/xquery/README b/oud/xquery/README similarity index 100% rename from xquery/README rename to oud/xquery/README diff --git a/xquery/collect.sh.oud b/oud/xquery/collect.sh.oud similarity index 100% rename from xquery/collect.sh.oud rename to oud/xquery/collect.sh.oud diff --git a/xquery/items2count.go.oud b/oud/xquery/items2count.go.oud similarity index 100% rename from xquery/items2count.go.oud rename to oud/xquery/items2count.go.oud diff --git a/xquery/locaties.xq b/oud/xquery/locaties.xq similarity index 100% rename from xquery/locaties.xq rename to oud/xquery/locaties.xq diff --git a/xquery/new2old.go b/oud/xquery/new2old.go similarity index 100% rename from xquery/new2old.go rename to oud/xquery/new2old.go diff --git a/xquery/nieuwe_namen.xq b/oud/xquery/nieuwe_namen.xq similarity index 100% rename from xquery/nieuwe_namen.xq rename to oud/xquery/nieuwe_namen.xq diff --git a/xquery/nieuwe_woorden.xq b/oud/xquery/nieuwe_woorden.xq similarity index 100% rename from xquery/nieuwe_woorden.xq rename to oud/xquery/nieuwe_woorden.xq diff --git a/xquery/organisaties.xq b/oud/xquery/organisaties.xq similarity index 100% rename from xquery/organisaties.xq rename to oud/xquery/organisaties.xq diff --git a/xquery/overige_namen.xq b/oud/xquery/overige_namen.xq similarity index 100% rename from xquery/overige_namen.xq rename to oud/xquery/overige_namen.xq diff --git a/xquery/personen.xq b/oud/xquery/personen.xq similarity index 100% rename from xquery/personen.xq rename to oud/xquery/personen.xq