This commit is contained in:
Peter Kleiweg
2026-03-26 08:45:25 +01:00
parent 7a8b0870e2
commit 010bc3ca59
4 changed files with 70 additions and 0 deletions

1
.gitignore vendored
View File

@@ -26,6 +26,7 @@ VRT/metadata
VRT/vrt
bin/ISOWeek
bin/score
bin/top20
bin/week2files
20??
corpus

View File

@@ -12,6 +12,7 @@ all:
make -C VRT
make bin/ISOWeek
make bin/score
make bin/top20
make bin/week2files
bin/ISOWeek: cmd/ISOWeek/*.go
@@ -20,6 +21,9 @@ bin/ISOWeek: cmd/ISOWeek/*.go
bin/score: cmd/score/*.go
go build -o $@ $^
bin/top20: cmd/top20/*.go
go build -o $@ $^
bin/week2files: cmd/week2files/*.go
go build -o $@ $^

62
cmd/top20/top20.go Normal file
View File

@@ -0,0 +1,62 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"os"
"regexp"
"strings"
)
// Package-level helper and state used by main.
var (
// x is a short alias for ExitErr from the pebbe/errors package.
// NOTE(review): presumably it terminates the program when handed a non-nil
// error (it is called with plain errors and with multi-value results whose
// last value is an error) — confirm against that package.
x = e.ExitErr
// reFile splits a file name into three groups: the text before the period,
// the period itself (a four-digit number starting with 2, a dash, and two
// digits in the range 00-59), and the text after the period.
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
// seen is the set of words read from previously written ".t20" files; main
// consults it to skip input lines whose word is already present.
seen = make(map[string]bool)
)
// main builds the top-20 list for one frequency file.
//
// The first command-line argument is a file name that contains a period
// matched by reFile. After changing to the data directory, main collects the
// word of every line found in existing ".t20" files that share the input's
// name before and after the period and whose name sorts before the target.
// It then copies the first 20 input lines whose word has not been collected
// to "<input>.t20".
//
// Lines are expected to look like "count<TAB>word..."; a line without a tab
// has no word field and is skipped instead of crashing the program.
func main() {
	// Without this check a missing argument panics with an index error.
	if len(os.Args) < 2 {
		os.Stderr.WriteString("Usage: top20 FILE\n")
		os.Exit(1)
	}

	// FindStringSubmatch returns nil when the name holds no period, so the
	// match has to be verified before its groups are indexed.
	m := reFile.FindStringSubmatch(os.Args[1])
	if m == nil {
		os.Stderr.WriteString("top20: no YYYY-WW period in file name: " + os.Args[1] + "\n")
		os.Exit(1)
	}
	infile := m[0]
	prefix := m[1]
	suffix := m[3] + ".t20"
	target := infile + ".t20"

	x(os.Chdir("/net/corpora/nlnieuws/data"))

	files, err := os.ReadDir(".")
	x(err)
	for _, file := range files {
		name := file.Name()
		// Only lists of the same series that sort before the target count
		// as history; the target itself is excluded by the strict compare.
		if !strings.HasPrefix(name, prefix) || !strings.HasSuffix(name, suffix) || name >= target {
			continue
		}
		fp, err := os.Open(name)
		x(err)
		scanner := bufio.NewScanner(fp)
		for scanner.Scan() {
			if w, ok := wordField(scanner.Text()); ok {
				seen[w] = true
			}
		}
		x(scanner.Err())
		x(fp.Close())
	}

	fpin, err := os.Open(infile)
	x(err)
	fpout, err := os.Create(target)
	x(err)

	scanner := bufio.NewScanner(fpin)
	// Test the counter first so no extra line is read once 20 are written.
	for n := 0; n < 20 && scanner.Scan(); {
		line := scanner.Text()
		w, ok := wordField(line)
		if !ok || seen[w] {
			continue
		}
		n++
		x(fpout.WriteString(line + "\n"))
	}
	x(scanner.Err())
	x(fpout.Close())
	x(fpin.Close())
}

// wordField returns everything after the first tab of line. The boolean is
// false when line contains no tab and therefore has no word field.
func wordField(line string) (string, bool) {
	i := strings.IndexByte(line, '\t')
	if i < 0 {
		return "", false
	}
	return line[i+1:], true
}

View File

@@ -33,6 +33,7 @@ do
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i
top20 nieuw-namen-$ds-$i
# score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
alto \
@@ -41,6 +42,7 @@ do
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i
top20 nieuw-woorden-$ds-$i
# score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
alto \
@@ -49,6 +51,7 @@ do
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-extra-$ds-$i
top20 nieuw-extra-$ds-$i
# score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
alto \