From 010bc3ca59c33ad5425d457a72dcea626c8c41bf Mon Sep 17 00:00:00 2001
From: Peter Kleiweg
Date: Thu, 26 Mar 2026 08:45:25 +0100
Subject: [PATCH] Top20

---
 .gitignore         |   1 +
 Makefile           |   4 ++
 cmd/top20/top20.go | 115 +++++++++++++++++++++++++++++++++++++++++++++
 collect.sh         |   3 ++
 4 files changed, 123 insertions(+)
 create mode 100644 cmd/top20/top20.go

diff --git a/.gitignore b/.gitignore
index b16dbbc..c16d56b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ VRT/metadata
 VRT/vrt
 bin/ISOWeek
 bin/score
+bin/top20
 bin/week2files
 20??
 corpus
diff --git a/Makefile b/Makefile
index fa34931..92a64b8 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,7 @@ all:
 	make -C VRT
 	make bin/ISOWeek
 	make bin/score
+	make bin/top20
 	make bin/week2files
 
 bin/ISOWeek: cmd/ISOWeek/*.go
@@ -20,6 +21,9 @@ bin/ISOWeek: cmd/ISOWeek/*.go
 bin/score: cmd/score/*.go
 	go build -o $@ $^
 
+bin/top20: cmd/top20/*.go
+	go build -o $@ $^
+
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^
 
diff --git a/cmd/top20/top20.go b/cmd/top20/top20.go
new file mode 100644
index 0000000..7b81202
--- /dev/null
+++ b/cmd/top20/top20.go
@@ -0,0 +1,115 @@
+// Command top20 copies the first 20 lines of a list whose item does not
+// occur in any .t20 file that shares the name prefix and suffix of the
+// output file and sorts before it.
+//
+// Usage:
+//
+//	top20 FILE
+//
+// The name of FILE must contain a stamp of the form 2DDD-DD, where D is a
+// digit and the first digit of the final pair is 0-5. FILE is opened after
+// changing to /net/corpora/nlnieuws/data, so a relative name refers to that
+// directory. Every line must contain a tab; the text after the first tab is
+// the item. The output is written to FILE.t20.
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"regexp"
+	"strings"
+)
+
+var (
+	// x is applied to every error result. NOTE(review): the program relies
+	// on ExitErr to terminate when the error is non-nil; confirm against the
+	// errors package.
+	x = e.ExitErr
+
+	// reFile captures the text before the stamp, the stamp, and the text
+	// after it.
+	reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
+
+	// seen records the items found in earlier .t20 files.
+	seen = make(map[string]bool)
+)
+
+// die prints a message to standard error and exits with status 1.
+func die(format string, a ...interface{}) {
+	fmt.Fprintf(os.Stderr, "top20: "+format+"\n", a...)
+	os.Exit(1)
+}
+
+// item returns everything after the first tab in line; that text is the key
+// by which entries are compared. A line without a tab is reported with its
+// file name and line number rather than triggering an index panic.
+func item(file string, lineno int, line string) string {
+	_, key, ok := strings.Cut(line, "\t")
+	if !ok {
+		die("%s:%d: line has no tab", file, lineno)
+	}
+	return key
+}
+
+// remember adds the item of every line of the file called name to seen.
+func remember(name string) {
+	fp, err := os.Open(name)
+	x(err)
+	scanner := bufio.NewScanner(fp)
+	for lineno := 1; scanner.Scan(); lineno++ {
+		seen[item(name, lineno, scanner.Text())] = true
+	}
+	x(scanner.Err())
+	x(fp.Close())
+}
+
+func main() {
+	// Validate the command line up front: a missing argument or a name
+	// without a stamp would otherwise end in an index-out-of-range panic.
+	if len(os.Args) < 2 {
+		die("usage: top20 FILE")
+	}
+	m := reFile.FindStringSubmatch(os.Args[1])
+	if m == nil {
+		die("%s: name contains no stamp such as 2026-13", os.Args[1])
+	}
+	infile := m[0]
+	prefix := m[1]
+	suffix := m[3] + ".t20"
+	target := infile + ".t20"
+
+	x(os.Chdir("/net/corpora/nlnieuws/data"))
+
+	// Collect the items of every .t20 file that has the same prefix and
+	// suffix as the target and sorts before it.
+	files, err := os.ReadDir(".")
+	x(err)
+	for _, file := range files {
+		name := file.Name()
+		if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
+			remember(name)
+		}
+	}
+
+	fpin, err := os.Open(infile)
+	x(err)
+	fpout, err := os.Create(target)
+	x(err)
+	scanner := bufio.NewScanner(fpin)
+	// The counter is tested before Scan so that no line is read and then
+	// discarded once 20 entries have been written.
+	for n, lineno := 0, 1; n < 20 && scanner.Scan(); lineno++ {
+		line := scanner.Text()
+		if seen[item(infile, lineno, line)] {
+			continue
+		}
+		n++
+		x(fpout.WriteString(line + "\n"))
+	}
+	x(scanner.Err())
+	x(fpout.Close())
+	x(fpin.Close())
+}
diff --git a/collect.sh b/collect.sh
index fee9064..8c87e63 100755
--- a/collect.sh
+++ b/collect.sh
@@ -33,6 +33,7 @@ do
         tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
         sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i
+    top20 nieuw-namen-$ds-$i
     # score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
 
 
     alto \
@@ -41,6 +42,7 @@ do
         tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
         sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i
+    top20 nieuw-woorden-$ds-$i
     # score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
 
 
     alto \
@@ -49,6 +51,7 @@ do
         'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
         sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-extra-$ds-$i
+    top20 nieuw-extra-$ds-$i
     # score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
 
 
     alto \