Top20

2026-03-26 08:45:25 +01:00
parent 7a8b0870e2
commit 010bc3ca59
4 changed files with 70 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ VRT/metadata
 VRT/vrt
 bin/ISOWeek
 bin/score
+bin/top20
 bin/week2files
 20??
 corpus
--- a/4
+++ b/4
@@ -12,6 +12,7 @@ all:
 	make -C VRT
 	make bin/ISOWeek
 	make bin/score
+	make bin/top20
 	make bin/week2files

 bin/ISOWeek: cmd/ISOWeek/*.go
@@ -20,6 +21,9 @@ bin/ISOWeek: cmd/ISOWeek/*.go
 bin/score: cmd/score/*.go
 	go build -o $@ $^

+bin/top20: cmd/top20/*.go
+	go build -o $@ $^
+
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^

--- a/cmd/top20/top20.go
+++ b/cmd/top20/top20.go
@@ -0,0 +1,62 @@
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"os"
+	"regexp"
+	"strings"
+)
+
+// Package-level state:
+//
+//   - x is shorthand for e.ExitErr and is wrapped around every fallible call
+//     in main (presumably it terminates the program on a non-nil error;
+//     confirm against codeberg.org/pebbe/errors).
+//   - reFile splits a file name around a token shaped like "2ddd-dd" whose
+//     part after the dash starts with a digit 0-5 (e.g. "2026-13"):
+//     group 1 is the text before the token, group 2 the token itself and
+//     group 3 the text after it.
+//   - seen is used as a set: main stores in it the text after the first tab
+//     of every line read from the earlier ".t20" files.
+var (
+	x      = e.ExitErr
+	reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
+	seen   = make(map[string]bool)
+)
+
+// main writes up to 20 not-previously-seen entries of the input file to
+// "<input>.t20".
+//
+// The first command-line argument is the input file name; it must contain a
+// token matched by reFile (e.g. "2026-13"). The program changes into the
+// hard-coded data directory before opening anything, so the name is resolved
+// relative to that directory.
+//
+// Each useful line has the form "<first field>\t<key>", where the key is
+// everything after the first tab. A key counts as already seen when it occurs
+// in any file in the data directory that has the same prefix (text before the
+// token) and the same suffix (text after the token, plus ".t20") and whose
+// name sorts before the target name. Lines are copied in input order until
+// 20 unseen entries have been written. Lines without a tab are ignored.
+func main() {
+
+	// Report a usage error instead of panicking on a missing argument.
+	if len(os.Args) < 2 {
+		os.Stderr.WriteString("usage: top20 FILE\n")
+		os.Exit(1)
+	}
+
+	// FindStringSubmatch returns nil when the name has no matching token;
+	// indexing it unchecked would panic.
+	m := reFile.FindStringSubmatch(os.Args[1])
+	if m == nil {
+		os.Stderr.WriteString("top20: no year-week token (like 2026-13) in file name: " + os.Args[1] + "\n")
+		os.Exit(1)
+	}
+	infile := m[0]
+	prefix := m[1]
+	suffix := m[3] + ".t20"
+	target := infile + ".t20"
+
+	x(os.Chdir("/net/corpora/nlnieuws/data"))
+
+	// Collect the keys of every earlier .t20 file of the same series.
+	files, err := os.ReadDir(".")
+	x(err)
+	for _, file := range files {
+		name := file.Name()
+		// "name < target" is a plain string comparison; with identical
+		// prefix and suffix it orders the files by their year-week token.
+		if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
+			fp, err := os.Open(name)
+			x(err)
+			scanner := bufio.NewScanner(fp)
+			for scanner.Scan() {
+				// A line without a tab has no key; skip it rather than
+				// index past the end of the split result.
+				if f := strings.SplitN(scanner.Text(), "\t", 2); len(f) == 2 {
+					seen[f[1]] = true
+				}
+			}
+			x(scanner.Err())
+			x(fp.Close())
+		}
+	}
+
+	fpin, err := os.Open(infile)
+	x(err)
+	fpout, err := os.Create(target)
+	x(err)
+	scanner := bufio.NewScanner(fpin)
+	n := 0
+	// Test the counter first so no further line is read once 20 entries
+	// have been written.
+	for n < 20 && scanner.Scan() {
+		line := scanner.Text()
+		f := strings.SplitN(line, "\t", 2)
+		if len(f) < 2 || seen[f[1]] {
+			continue
+		}
+		n++
+		x(fpout.WriteString(line + "\n"))
+	}
+	x(scanner.Err())
+	x(fpout.Close())
+	x(fpin.Close())
+}
--- a/collect.sh
+++ b/collect.sh
@@ -33,6 +33,7 @@ do
        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\)  */\1\t/' | \
        sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i

+    top20 nieuw-namen-$ds-$i
    # score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score

    alto \
@@ -41,6 +42,7 @@ do
        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\)  */\1\t/' | \
        sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i

+    top20 nieuw-woorden-$ds-$i
    # score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score

    alto \
@@ -49,6 +51,7 @@ do
        'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | \
        sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-extra-$ds-$i

+    top20 nieuw-extra-$ds-$i
    # score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score

    alto \