score; collect.sh
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -25,6 +25,7 @@ Tzum/xml2txt
|
||||
VRT/metadata
|
||||
VRT/vrt
|
||||
bin/ISOWeek
|
||||
bin/score
|
||||
bin/week2files
|
||||
20??
|
||||
corpus
|
||||
|
||||
4
Makefile
4
Makefile
@@ -11,11 +11,15 @@ all:
|
||||
make -C Tzum
|
||||
make -C VRT
|
||||
make bin/ISOWeek
|
||||
make bin/score
|
||||
make bin/week2files
|
||||
|
||||
bin/ISOWeek: cmd/ISOWeek/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/score: cmd/score/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/week2files: cmd/week2files/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
|
||||
112
cmd/score/score.go
Normal file
112
cmd/score/score.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Item is one row of the report printed by main: a text taken from the
// current week's file together with its change in count.
type Item struct {
	// text is the part of an input line after the first tab.
	text string
	// score is the count in the current file minus the count recorded for
	// the same text in the previous file (zero when it was absent there).
	score int
	// isnew is true when text did not occur in the previous file.
	isnew bool
}
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
|
||||
count = make(map[string]int)
|
||||
items = make([]Item, 0)
|
||||
)
|
||||
|
||||
func main() {
|
||||
filename := os.Args[1]
|
||||
prevname := getPrev(filename)
|
||||
|
||||
fp, err := os.Open(prevname)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
aa := strings.SplitN(scanner.Text(), "\t", 2)
|
||||
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||
x(err)
|
||||
count[aa[1]] = n
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
|
||||
fp, err = os.Open(filename)
|
||||
x(err)
|
||||
scanner = bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
aa := strings.SplitN(scanner.Text(), "\t", 2)
|
||||
n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||
x(err)
|
||||
n1, ok := count[aa[1]]
|
||||
items = append(items, Item{
|
||||
text: aa[1],
|
||||
score: n - n1,
|
||||
isnew: !ok,
|
||||
})
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
|
||||
sort.Slice(items, func(i, j int) bool {
|
||||
/*
|
||||
if items[i].isnew && !items[j].isnew {
|
||||
return true
|
||||
}
|
||||
if !items[i].isnew && items[j].isnew {
|
||||
return false
|
||||
}
|
||||
*/
|
||||
if items[i].score != items[j].score {
|
||||
return items[i].score > items[j].score
|
||||
}
|
||||
return items[i].text < items[j].text
|
||||
})
|
||||
|
||||
for _, item := range items {
|
||||
/*
|
||||
if item.score < 2 {
|
||||
break
|
||||
}
|
||||
*/
|
||||
p := "."
|
||||
if item.isnew {
|
||||
p = "N"
|
||||
}
|
||||
fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func getPrev(filename string) string {
|
||||
mm := reYearWeek.FindStringSubmatch(filename)
|
||||
year, err := strconv.Atoi(mm[2][:4])
|
||||
x(err)
|
||||
week, err := strconv.Atoi(mm[2][5:])
|
||||
x(err)
|
||||
|
||||
week--
|
||||
if week == 0 {
|
||||
week = 53
|
||||
year--
|
||||
}
|
||||
newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
|
||||
if week == 53 {
|
||||
_, err := os.Stat(newname)
|
||||
if err == nil {
|
||||
return newname
|
||||
}
|
||||
newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
|
||||
}
|
||||
return newname
|
||||
}
|
||||
@@ -25,6 +25,8 @@ func main() {
|
||||
week, err := strconv.Atoi(aa[1])
|
||||
x(err)
|
||||
|
||||
weken, err := strconv.Atoi(os.Args[2])
|
||||
|
||||
if year < 1000 || year > 9999 {
|
||||
x(fmt.Errorf("ongeldig year: %d", year))
|
||||
}
|
||||
@@ -74,9 +76,9 @@ func main() {
|
||||
}
|
||||
t = t.AddDate(0, 0, 1-d)
|
||||
|
||||
// drie voorgaande weken
|
||||
t2 := t.AddDate(0, 0, -21)
|
||||
for i := range 4 {
|
||||
// voorgaande weken en deze week
|
||||
t2 := t.AddDate(0, 0, -7*(weken-1))
|
||||
for i := range weken {
|
||||
if i > 0 {
|
||||
fmt.Print(" -or")
|
||||
}
|
||||
@@ -85,9 +87,9 @@ func main() {
|
||||
t2 = t2.AddDate(0, 0, 7)
|
||||
}
|
||||
|
||||
// vanaf begin drie weken geleden t/m eind huidige week
|
||||
t = t.AddDate(0, 0, -21)
|
||||
for range 28 {
|
||||
// vanaf begin voorgaande weken t/m eind huidige week
|
||||
t = t.AddDate(0, 0, -7*(weken-1))
|
||||
for range 7 * weken {
|
||||
fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
|
||||
t = t.AddDate(0, 0, 1)
|
||||
}
|
||||
|
||||
77
collect.sh
77
collect.sh
@@ -21,40 +21,61 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
files=$(find /net/corpora/nlnieuws/ $(week2files $ds))
|
||||
|
||||
cd /net/corpora/nlnieuws/data
|
||||
|
||||
alto \
|
||||
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
|
||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds
|
||||
for i in 1 4
|
||||
do
|
||||
|
||||
alto \
|
||||
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
||||
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds
|
||||
files=$(find .. $(week2files $ds $i))
|
||||
|
||||
alto \
|
||||
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
||||
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
||||
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-extra-$ds
|
||||
alto \
|
||||
'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
|
||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-namen-$ds-$i
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > locaties-$ds
|
||||
score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > personen-$ds
|
||||
alto \
|
||||
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
||||
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
||||
tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-woorden-$ds-$i
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > organisatie-$ds
|
||||
score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-$ds
|
||||
alto \
|
||||
'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
|
||||
his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
|
||||
'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > nieuw-extra-$ds-$i
|
||||
|
||||
alto \
|
||||
'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
|
||||
'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-extra-$ds
|
||||
score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > locaties-$ds-$i
|
||||
|
||||
score locaties-$ds-$i > locaties-$ds-$i.score
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > personen-$ds-$i
|
||||
|
||||
score personen-$ds-$i > personen-$ds-$i.score
|
||||
|
||||
# Count lemmas of organisation names (neclass ORG), single words and
# multi-word units. The second @his test used to read " normal" with a leading
# space, unlike the LOC/PER/MISC queries, so the multi-word branch never matched.
alto \
'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' \
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > organisaties-$ds-$i
|
||||
|
||||
score organisaties-$ds-$i > organisaties-$ds-$i.score
|
||||
|
||||
alto \
|
||||
'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
|
||||
tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > overigen-$ds-$i
|
||||
|
||||
score overigen-$ds-$i > overigen-$ds-$i.score
|
||||
|
||||
alto \
|
||||
'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
|
||||
'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) */\1\t/' | sort -nr > overigen-extra-$ds-$i
|
||||
|
||||
score overigen-extra-$ds-$i > overigen-extra-$ds-$i.score
|
||||
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user