score; collect.sh
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -25,6 +25,7 @@ Tzum/xml2txt
|
|||||||
VRT/metadata
|
VRT/metadata
|
||||||
VRT/vrt
|
VRT/vrt
|
||||||
bin/ISOWeek
|
bin/ISOWeek
|
||||||
|
bin/score
|
||||||
bin/week2files
|
bin/week2files
|
||||||
20??
|
20??
|
||||||
corpus
|
corpus
|
||||||
|
|||||||
4
Makefile
4
Makefile
@@ -11,11 +11,15 @@ all:
|
|||||||
make -C Tzum
|
make -C Tzum
|
||||||
make -C VRT
|
make -C VRT
|
||||||
make bin/ISOWeek
|
make bin/ISOWeek
|
||||||
|
make bin/score
|
||||||
make bin/week2files
|
make bin/week2files
|
||||||
|
|
||||||
bin/ISOWeek: cmd/ISOWeek/*.go
|
bin/ISOWeek: cmd/ISOWeek/*.go
|
||||||
go build -o $@ $^
|
go build -o $@ $^
|
||||||
|
|
||||||
|
bin/score: cmd/score/*.go
|
||||||
|
go build -o $@ $^
|
||||||
|
|
||||||
bin/week2files: cmd/week2files/*.go
|
bin/week2files: cmd/week2files/*.go
|
||||||
go build -o $@ $^
|
go build -o $@ $^
|
||||||
|
|
||||||
|
|||||||
112
cmd/score/score.go
Normal file
112
cmd/score/score.go
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
e "codeberg.org/pebbe/errors"
|
||||||
|
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Item is one entry of the current frequency list together with its change
// relative to the previous list.
type Item struct {
	// text is the counted string (everything after the first tab of a line).
	text string
	// score is the current count minus the previous count.
	score int
	// isnew is true when text does not occur in the previous list.
	isnew bool
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
x = e.ExitErr
|
||||||
|
reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
|
||||||
|
count = make(map[string]int)
|
||||||
|
items = make([]Item, 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
filename := os.Args[1]
|
||||||
|
prevname := getPrev(filename)
|
||||||
|
|
||||||
|
fp, err := os.Open(prevname)
|
||||||
|
x(err)
|
||||||
|
scanner := bufio.NewScanner(fp)
|
||||||
|
for scanner.Scan() {
|
||||||
|
aa := strings.SplitN(scanner.Text(), "\t", 2)
|
||||||
|
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||||
|
x(err)
|
||||||
|
count[aa[1]] = n
|
||||||
|
}
|
||||||
|
x(scanner.Err())
|
||||||
|
x(fp.Close())
|
||||||
|
|
||||||
|
fp, err = os.Open(filename)
|
||||||
|
x(err)
|
||||||
|
scanner = bufio.NewScanner(fp)
|
||||||
|
for scanner.Scan() {
|
||||||
|
aa := strings.SplitN(scanner.Text(), "\t", 2)
|
||||||
|
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||||
|
x(err)
|
||||||
|
n1, ok := count[aa[1]]
|
||||||
|
items = append(items, Item{
|
||||||
|
text: aa[1],
|
||||||
|
score: n - n1,
|
||||||
|
isnew: !ok,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
x(scanner.Err())
|
||||||
|
x(fp.Close())
|
||||||
|
|
||||||
|
sort.Slice(items, func(i, j int) bool {
|
||||||
|
/*
|
||||||
|
if items[i].isnew && !items[j].isnew {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if !items[i].isnew && items[j].isnew {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
if items[i].score != items[j].score {
|
||||||
|
return items[i].score > items[j].score
|
||||||
|
}
|
||||||
|
return items[i].text < items[j].text
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, item := range items {
|
||||||
|
/*
|
||||||
|
if item.score < 2 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
p := "."
|
||||||
|
if item.isnew {
|
||||||
|
p = "N"
|
||||||
|
}
|
||||||
|
fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func getPrev(filename string) string {
|
||||||
|
mm := reYearWeek.FindStringSubmatch(filename)
|
||||||
|
year, err := strconv.Atoi(mm[2][:4])
|
||||||
|
x(err)
|
||||||
|
week, err := strconv.Atoi(mm[2][5:])
|
||||||
|
x(err)
|
||||||
|
|
||||||
|
week--
|
||||||
|
if week == 0 {
|
||||||
|
week = 53
|
||||||
|
year--
|
||||||
|
}
|
||||||
|
newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
|
||||||
|
if week == 53 {
|
||||||
|
_, err := os.Stat(newname)
|
||||||
|
if err == nil {
|
||||||
|
return newname
|
||||||
|
}
|
||||||
|
newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
|
||||||
|
}
|
||||||
|
return newname
|
||||||
|
}
|
||||||
@@ -25,6 +25,8 @@ func main() {
|
|||||||
week, err := strconv.Atoi(aa[1])
|
week, err := strconv.Atoi(aa[1])
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
|
weken, err := strconv.Atoi(os.Args[2])
|
||||||
|
|
||||||
if year < 1000 || year > 9999 {
|
if year < 1000 || year > 9999 {
|
||||||
x(fmt.Errorf("ongeldig year: %d", year))
|
x(fmt.Errorf("ongeldig year: %d", year))
|
||||||
}
|
}
|
||||||
@@ -74,9 +76,9 @@ func main() {
|
|||||||
}
|
}
|
||||||
t = t.AddDate(0, 0, 1-d)
|
t = t.AddDate(0, 0, 1-d)
|
||||||
|
|
||||||
// drie voorgaande weken
|
// voorgaande weken en deze week
|
||||||
t2 := t.AddDate(0, 0, -21)
|
t2 := t.AddDate(0, 0, -7*(weken-1))
|
||||||
for i := range 4 {
|
for i := range weken {
|
||||||
if i > 0 {
|
if i > 0 {
|
||||||
fmt.Print(" -or")
|
fmt.Print(" -or")
|
||||||
}
|
}
|
||||||
@@ -85,9 +87,9 @@ func main() {
|
|||||||
t2 = t2.AddDate(0, 0, 7)
|
t2 = t2.AddDate(0, 0, 7)
|
||||||
}
|
}
|
||||||
|
|
||||||
// vanaf begin drie weken geleden t/m eind huidige week
|
// vanaf begin voorgaande weken t/m eind huidige week
|
||||||
t = t.AddDate(0, 0, -21)
|
t = t.AddDate(0, 0, -7*(weken-1))
|
||||||
for range 28 {
|
for range 7 * weken {
|
||||||
fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
|
fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
|
||||||
t = t.AddDate(0, 0, 1)
|
t = t.AddDate(0, 0, 1)
|
||||||
}
|
}
|
||||||
|
|||||||
77
collect.sh
77
collect.sh
@@ -21,40 +21,61 @@ else
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
files=$(find /net/corpora/nlnieuws/ $(week2files $ds))
|
|
||||||
|
|
||||||
cd /net/corpora/nlnieuws/data
|
cd /net/corpora/nlnieuws/data
|
||||||
|
|
||||||
alto \
|
for i in 1 4
|
||||||
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
|
do
|
||||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds
|
|
||||||
|
|
||||||
alto \
|
files=$(find .. $(week2files $ds $i))
|
||||||
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
|
||||||
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
|
||||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds
|
|
||||||
|
|
||||||
alto \
|
alto \
|
||||||
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
|
||||||
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-namen-$ds-$i
|
||||||
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-extra-$ds
|
|
||||||
|
|
||||||
alto \
|
score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
|
||||||
'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
|
|
||||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > locaties-$ds
|
|
||||||
|
|
||||||
alto \
|
alto \
|
||||||
'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
|
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
||||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > personen-$ds
|
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
||||||
|
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-woorden-$ds-$i
|
||||||
|
|
||||||
alto \
|
score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
|
||||||
'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
|
|
||||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > organisatie-$ds
|
|
||||||
|
|
||||||
alto \
|
alto \
|
||||||
'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
|
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
||||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-$ds
|
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
||||||
|
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-extra-$ds-$i
|
||||||
|
|
||||||
alto \
|
score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
|
||||||
'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
|
|
||||||
'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-extra-$ds
|
alto \
|
||||||
|
'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
|
||||||
|
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > locaties-$ds-$i
|
||||||
|
|
||||||
|
score locaties-$ds-$i > locaties-$ds-$i.score
|
||||||
|
|
||||||
|
alto \
|
||||||
|
'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
|
||||||
|
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > personen-$ds-$i
|
||||||
|
|
||||||
|
score personen-$ds-$i > personen-$ds-$i.score
|
||||||
|
|
||||||
|
alto \
|
||||||
|
'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
|
||||||
|
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > organisaties-$ds-$i
|
||||||
|
|
||||||
|
score organisaties-$ds-$i > organisaties-$ds-$i.score
|
||||||
|
|
||||||
|
alto \
|
||||||
|
'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
|
||||||
|
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > overigen-$ds-$i
|
||||||
|
|
||||||
|
score overigen-$ds-$i > overigen-$ds-$i.score
|
||||||
|
|
||||||
|
alto \
|
||||||
|
'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
|
||||||
|
'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > overigen-extra-$ds-$i
|
||||||
|
|
||||||
|
score overigen-extra-$ds-$i > overigen-extra-$ds-$i.score
|
||||||
|
|
||||||
|
done
|
||||||
|
|||||||
Reference in New Issue
Block a user