score; collect.sh

This commit is contained in:
Peter Kleiweg
2026-03-19 12:37:35 +01:00
parent 01438b69ac
commit 025c134c07
5 changed files with 174 additions and 34 deletions

1
.gitignore vendored
View File

@@ -25,6 +25,7 @@ Tzum/xml2txt
VRT/metadata
VRT/vrt
bin/ISOWeek
bin/score
bin/week2files
20??
corpus

View File

@@ -11,11 +11,15 @@ all:
make -C Tzum
make -C VRT
make bin/ISOWeek
make bin/score
make bin/week2files
bin/ISOWeek: cmd/ISOWeek/*.go
go build -o $@ $^
bin/score: cmd/score/*.go
go build -o $@ $^
bin/week2files: cmd/week2files/*.go
go build -o $@ $^

112
cmd/score/score.go Normal file
View File

@@ -0,0 +1,112 @@
package main
import (
e "codeberg.org/pebbe/errors"
"bufio"
"fmt"
"os"
"regexp"
"sort"
"strconv"
"strings"
)
// Item is one entry of the current counts file together with its change
// relative to the previous counts file.
type Item struct {
// text is the part of the input line that follows the first tab.
text string
// score is the current count minus the previous count; the previous count
// is 0 when the text did not occur in the previous file.
score int
// isnew is true when the text did not occur in the previous file.
isnew bool
}
var (
// x is the error sink used throughout this file: every error value,
// nil or not, is handed to it.
// NOTE(review): presumably it reports a non-nil error and terminates the
// program — confirm against codeberg.org/pebbe/errors.
x = e.ExitErr
// reYearWeek splits a file name into a prefix, a "YYYY-WW" part (year
// 1000-2999, two-digit number 00-59) and a suffix. The leading (.*) is
// greedy, so when the name contains several such parts the last one is
// the one captured.
reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
// count maps a text to its count as read from the previous file.
count = make(map[string]int)
// items collects one Item per line of the current file.
items = make([]Item, 0)
)
func main() {
filename := os.Args[1]
prevname := getPrev(filename)
fp, err := os.Open(prevname)
x(err)
scanner := bufio.NewScanner(fp)
for scanner.Scan() {
aa := strings.SplitN(scanner.Text(), "\t", 2)
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
count[aa[1]] = n
}
x(scanner.Err())
x(fp.Close())
fp, err = os.Open(filename)
x(err)
scanner = bufio.NewScanner(fp)
for scanner.Scan() {
aa := strings.SplitN(scanner.Text(), "\t", 2)
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
n1, ok := count[aa[1]]
items = append(items, Item{
text: aa[1],
score: n - n1,
isnew: !ok,
})
}
x(scanner.Err())
x(fp.Close())
sort.Slice(items, func(i, j int) bool {
/*
if items[i].isnew && !items[j].isnew {
return true
}
if !items[i].isnew && items[j].isnew {
return false
}
*/
if items[i].score != items[j].score {
return items[i].score > items[j].score
}
return items[i].text < items[j].text
})
for _, item := range items {
/*
if item.score < 2 {
break
}
*/
p := "."
if item.isnew {
p = "N"
}
fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
}
}
// getPrev returns the name of the counts file for the week preceding the one
// encoded in filename.
//
// filename must contain a "YYYY-WW" part as matched by reYearWeek; the text
// before and after that part is copied unchanged into the result. For weeks
// 2-53 the week number is simply decremented. For week 1 the result refers
// to the previous year: week 53 if a file with that name exists, otherwise
// week 52.
//
// A filename without a "YYYY-WW" part, or with a week outside 1-53, is
// reported through x.
// NOTE(review): x is expected to terminate the program on a non-nil error;
// the empty-string returns after such calls only keep the code safe should x
// ever return.
func getPrev(filename string) string {
	mm := reYearWeek.FindStringSubmatch(filename)
	if mm == nil {
		// Without this check mm[2] below would panic with an index error.
		x(fmt.Errorf("no YYYY-WW part in file name %q", filename))
		return ""
	}
	year, err := strconv.Atoi(mm[2][:4])
	x(err)
	week, err := strconv.Atoi(mm[2][5:])
	x(err)
	// The pattern admits 00-59, but only 1-53 are valid week numbers.
	if week < 1 || week > 53 {
		x(fmt.Errorf("invalid week %02d in file name %q", week, filename))
		return ""
	}

	if week > 1 {
		return fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
	}

	// Week 1: the previous year ends in week 52 or 53. Use the existence of
	// the week-53 file to decide which of the two to return.
	year--
	name53 := fmt.Sprintf("%s%d-%02d%s", mm[1], year, 53, mm[3])
	if _, err := os.Stat(name53); err == nil {
		return name53
	}
	return fmt.Sprintf("%s%d-%02d%s", mm[1], year, 52, mm[3])
}

View File

@@ -25,6 +25,8 @@ func main() {
week, err := strconv.Atoi(aa[1])
x(err)
weken, err := strconv.Atoi(os.Args[2])
if year < 1000 || year > 9999 {
x(fmt.Errorf("ongeldig year: %d", year))
}
@@ -74,9 +76,9 @@ func main() {
}
t = t.AddDate(0, 0, 1-d)
// drie voorgaande weken
t2 := t.AddDate(0, 0, -21)
for i := range 4 {
// voorgaande weken en deze week
t2 := t.AddDate(0, 0, -7*(weken-1))
for i := range weken {
if i > 0 {
fmt.Print(" -or")
}
@@ -85,9 +87,9 @@ func main() {
t2 = t2.AddDate(0, 0, 7)
}
// vanaf begin drie weken geleden t/m eind huidige week
t = t.AddDate(0, 0, -21)
for range 28 {
// vanaf begin voorgaande weken t/m eind huidige week
t = t.AddDate(0, 0, -7*(weken-1))
for range 7 * weken {
fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
t = t.AddDate(0, 0, 1)
}

View File

@@ -21,40 +21,61 @@ else
esac
fi
files=$(find /net/corpora/nlnieuws/ $(week2files $ds))
cd /net/corpora/nlnieuws/data
alto \
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds
for i in 1 4
do
alto \
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds
files=$(find .. $(week2files $ds $i))
alto \
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-extra-$ds
alto \
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-namen-$ds-$i
alto \
'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > locaties-$ds
score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
alto \
'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > personen-$ds
alto \
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-woorden-$ds-$i
alto \
'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > organisatie-$ds
score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
alto \
'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-$ds
alto \
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-extra-$ds-$i
alto \
'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-extra-$ds
score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
alto \
'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > locaties-$ds-$i
score locaties-$ds-$i > locaties-$ds-$i.score
alto \
'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > personen-$ds-$i
score personen-$ds-$i > personen-$ds-$i.score
alto \
'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > organisaties-$ds-$i
score organisaties-$ds-$i > organisaties-$ds-$i.score
alto \
'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > overigen-$ds-$i
score overigen-$ds-$i > overigen-$ds-$i.score
alto \
'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > overigen-extra-$ds-$i
score overigen-extra-$ds-$i > overigen-extra-$ds-$i.score
done