Top20
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -26,6 +26,7 @@ VRT/metadata
|
|||||||
VRT/vrt
|
VRT/vrt
|
||||||
bin/ISOWeek
|
bin/ISOWeek
|
||||||
bin/score
|
bin/score
|
||||||
|
bin/top20
|
||||||
bin/week2files
|
bin/week2files
|
||||||
20??
|
20??
|
||||||
corpus
|
corpus
|
||||||
|
|||||||
4
Makefile
4
Makefile
@@ -12,6 +12,7 @@ all:
|
|||||||
make -C VRT
|
make -C VRT
|
||||||
make bin/ISOWeek
|
make bin/ISOWeek
|
||||||
make bin/score
|
make bin/score
|
||||||
|
make bin/top20
|
||||||
make bin/week2files
|
make bin/week2files
|
||||||
|
|
||||||
bin/ISOWeek: cmd/ISOWeek/*.go
|
bin/ISOWeek: cmd/ISOWeek/*.go
|
||||||
@@ -20,6 +21,9 @@ bin/ISOWeek: cmd/ISOWeek/*.go
|
|||||||
bin/score: cmd/score/*.go
|
bin/score: cmd/score/*.go
|
||||||
go build -o $@ $^
|
go build -o $@ $^
|
||||||
|
|
||||||
|
bin/top20: cmd/top20/*.go
|
||||||
|
go build -o $@ $^
|
||||||
|
|
||||||
bin/week2files: cmd/week2files/*.go
|
bin/week2files: cmd/week2files/*.go
|
||||||
go build -o $@ $^
|
go build -o $@ $^
|
||||||
|
|
||||||
|
|||||||
62
cmd/top20/top20.go
Normal file
62
cmd/top20/top20.go
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
e "codeberg.org/pebbe/errors"
|
||||||
|
|
||||||
|
"bufio"
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Package-level helpers and state used by main.
var (
|
||||||
|
// x is a shorthand for e.ExitErr; main passes it the error results of
// os and bufio calls. NOTE(review): presumably it terminates the program
// when given a non-nil error — confirm against codeberg.org/pebbe/errors.
x = e.ExitErr
|
||||||
|
// reFile splits a file name into three groups: everything before a
// "2ddd-dd" token (second group, last two digits 00-59), the token itself,
// and everything after it. main uses group 1 as prefix and group 3 as
// suffix. NOTE(review): the token looks like a year plus ISO week number
// — confirm.
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
|
||||||
|
// seen is used as a set: main stores the text after the first tab of each
// line read from earlier ".t20" files, and skips input lines whose text
// is already present.
seen = make(map[string]bool)
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
|
||||||
|
m := reFile.FindStringSubmatch(os.Args[1])
|
||||||
|
infile := m[0]
|
||||||
|
prefix := m[1]
|
||||||
|
suffix := m[3] + ".t20"
|
||||||
|
target := infile + ".t20"
|
||||||
|
|
||||||
|
x(os.Chdir("/net/corpora/nlnieuws/data"))
|
||||||
|
|
||||||
|
files, err := os.ReadDir(".")
|
||||||
|
x(err)
|
||||||
|
for _, file := range files {
|
||||||
|
name := file.Name()
|
||||||
|
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
|
||||||
|
fp, err := os.Open(name)
|
||||||
|
x(err)
|
||||||
|
scanner := bufio.NewScanner(fp)
|
||||||
|
for scanner.Scan() {
|
||||||
|
seen[strings.SplitN(scanner.Text(), "\t", 2)[1]] = true
|
||||||
|
}
|
||||||
|
x(scanner.Err())
|
||||||
|
x(fp.Close())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fpin, err := os.Open(infile)
|
||||||
|
x(err)
|
||||||
|
fpout, err := os.Create(target)
|
||||||
|
x(err)
|
||||||
|
scanner := bufio.NewScanner(fpin)
|
||||||
|
n := 0
|
||||||
|
for scanner.Scan() && n < 20 {
|
||||||
|
line := scanner.Text()
|
||||||
|
w := strings.SplitN(line, "\t", 2)[1]
|
||||||
|
if seen[w] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n++
|
||||||
|
x(fpout.WriteString(line + "\n"))
|
||||||
|
}
|
||||||
|
x(scanner.Err())
|
||||||
|
x(fpout.Close())
|
||||||
|
x(fpin.Close())
|
||||||
|
}
|
||||||
@@ -33,6 +33,7 @@ do
|
|||||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
|
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
|
||||||
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i
|
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-namen-$ds-$i
|
||||||
|
|
||||||
|
top20 nieuw-namen-$ds-$i
|
||||||
# score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
|
# score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
|
||||||
|
|
||||||
alto \
|
alto \
|
||||||
@@ -41,6 +42,7 @@ do
|
|||||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
|
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
|
||||||
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i
|
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-woorden-$ds-$i
|
||||||
|
|
||||||
|
top20 nieuw-woorden-$ds-$i
|
||||||
# score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
|
# score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
|
||||||
|
|
||||||
alto \
|
alto \
|
||||||
@@ -49,6 +51,7 @@ do
|
|||||||
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
|
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | \
|
||||||
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-extra-$ds-$i
|
sort -f -k 2 | sort -n -r -k 1,1 -s > nieuw-extra-$ds-$i
|
||||||
|
|
||||||
|
top20 nieuw-extra-$ds-$i
|
||||||
# score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
|
# score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
|
||||||
|
|
||||||
alto \
|
alto \
|
||||||
|
|||||||
Reference in New Issue
Block a user