score; collect.sh

2026-03-19 12:37:35 +01:00
parent 01438b69ac
commit 025c134c07
5 changed files with 174 additions and 34 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ Tzum/xml2txt
 VRT/metadata
 VRT/vrt
 bin/ISOWeek
 bin/score
 bin/week2files
 20??
 corpus
--- a/4
+++ b/4
@@ -11,11 +11,15 @@ all:
 	make -C Tzum
 	make -C VRT
 	make bin/ISOWeek
 	make bin/score
 	make bin/week2files
 bin/ISOWeek: cmd/ISOWeek/*.go
 	go build -o $@ $^
 bin/score: cmd/score/*.go
 	go build -o $@ $^
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^
--- a/cmd/score/score.go
+++ b/cmd/score/score.go
@@ -0,0 +1,112 @@
 package main
 import (
 	e "codeberg.org/pebbe/errors"
 	"bufio"
 	"fmt"
 	"os"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"
 )
 // Item is one output row: an entry of the current count file together with
 // its change relative to the previous count file.
 type Item struct {
 	// text is the part of the input line after the first tab.
 	text  string
 	// score is the current count minus the previous count (the previous
 	// count is 0 when the text did not occur in the previous file).
 	score int
 	// isnew is true when the text did not occur in the previous file.
 	isnew bool
 }
 var (
 	// x is the error handler used throughout this file.
 	// NOTE(review): presumably it prints the error and exits when the
 	// argument is non-nil; the callers below rely on that. Confirm against
 	// the codeberg.org/pebbe/errors package.
 	x          = e.ExitErr
 	// reYearWeek splits a file name into a prefix, a "YYYY-WW" part and a
 	// suffix. The leading group is greedy, so the last "YYYY-WW" in the
 	// name is the one captured.
 	reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
 	// count maps an entry's text to its count in the previous file.
 	count      = make(map[string]int)
 	// items holds one Item per line of the current file.
 	items      = make([]Item, 0)
 )
 func main() {
 	filename := os.Args[1]
 	prevname := getPrev(filename)
 	fp, err := os.Open(prevname)
 	x(err)
 	scanner := bufio.NewScanner(fp)
 	for scanner.Scan() {
 		aa := strings.SplitN(scanner.Text(), "\t", 2)
 		n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
 		x(err)
 		count[aa[1]] = n
 	}
 	x(scanner.Err())
 	x(fp.Close())
 	fp, err = os.Open(filename)
 	x(err)
 	scanner = bufio.NewScanner(fp)
 	for scanner.Scan() {
 		aa := strings.SplitN(scanner.Text(), "\t", 2)
 		n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
 		x(err)
 		n1, ok := count[aa[1]]
 		items = append(items, Item{
 			text:  aa[1],
 			score: n - n1,
 			isnew: !ok,
 		})
 	}
 	x(scanner.Err())
 	x(fp.Close())
 	sort.Slice(items, func(i, j int) bool {
 		/*
 			if items[i].isnew && !items[j].isnew {
 				return true
 			}
 			if !items[i].isnew && items[j].isnew {
 				return false
 			}
 		*/
 		if items[i].score != items[j].score {
 			return items[i].score > items[j].score
 		}
 		return items[i].text < items[j].text
 	})
 	for _, item := range items {
 		/*
 			if item.score < 2 {
 				break
 			}
 		*/
 		p := "."
 		if item.isnew {
 			p = "N"
 		}
 		fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
 	}
 }
 // getPrev returns the name of the file for the week before the one encoded
 // in filename. filename must contain a "YYYY-WW" part (matched by
 // reYearWeek); everything before and after that part is kept unchanged.
 //
 // For week 01 the result refers to the last week of the previous year: week
 // 53 when a file with that name exists, week 52 otherwise.
 //
 // A name without a "YYYY-WW" part, or with a week outside 1..53, is reported
 // to x, which is expected to terminate the program. Previously the former
 // caused a nil-slice index panic and week 00 produced a name such as
 // "2026--1".
 func getPrev(filename string) string {
 	mm := reYearWeek.FindStringSubmatch(filename)
 	if mm == nil {
 		x(fmt.Errorf("no YYYY-WW in file name: %s", filename))
 	}
 	year, err := strconv.Atoi(mm[2][:4])
 	x(err)
 	week, err := strconv.Atoi(mm[2][5:])
 	x(err)
 	// The pattern accepts 00-59; only 01-53 can be week numbers.
 	if week < 1 || week > 53 {
 		x(fmt.Errorf("invalid week %02d in file name: %s", week, filename))
 	}
 	if week > 1 {
 		return fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
 	}
 	// Week 01: the previous year ends with week 53 or week 52. Prefer the
 	// week-53 file when it exists.
 	newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year-1, 53, mm[3])
 	if _, err := os.Stat(newname); err == nil {
 		return newname
 	}
 	return fmt.Sprintf("%s%d-%02d%s", mm[1], year-1, 52, mm[3])
 }
--- a/cmd/week2files/week2files.go
+++ b/cmd/week2files/week2files.go
@@ -25,6 +25,8 @@ func main() {
 	week, err := strconv.Atoi(aa[1])
 	x(err)
 	weken, err := strconv.Atoi(os.Args[2])
 	if year < 1000 || year > 9999 {
 		x(fmt.Errorf("ongeldig year: %d", year))
 	}
@@ -74,9 +76,9 @@ func main() {
 	}
 	t = t.AddDate(0, 0, 1-d)
-	// drie voorgaande weken
+	// voorgaande weken en deze week
-	t2 := t.AddDate(0, 0, -21)
+	t2 := t.AddDate(0, 0, -7*(weken-1))
-	for i := range 4 {
+	for i := range weken {
 		if i > 0 {
 			fmt.Print(" -or")
 		}
@@ -85,9 +87,9 @@ func main() {
 		t2 = t2.AddDate(0, 0, 7)
 	}
-	// vanaf begin drie weken geleden t/m eind huidige week
+	// vanaf begin voorgaande weken t/m eind huidige week
-	t = t.AddDate(0, 0, -21)
+	t = t.AddDate(0, 0, -7*(weken-1))
-	for range 28 {
+	for range 7 * weken {
 		fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
 		t = t.AddDate(0, 0, 1)
 	}
--- a/collect.sh
+++ b/collect.sh
@@ -21,40 +21,61 @@ else
    esac
 fi
 files=$(find /net/corpora/nlnieuws/ $(week2files $ds))
 cd /net/corpora/nlnieuws/data
-alto \
+for i in 1 4
-    'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
+do
    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds
-alto \
+    files=$(find .. $(week2files $ds $i))
    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
 his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds
-alto \
+    alto \
-    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
+        'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
-his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > nieuw-namen-$ds-$i
    'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > nieuw-extra-$ds
-alto \
+    score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
    'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > locaties-$ds
-alto \
+    alto \
-    'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
+        'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > personen-$ds
+    his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > nieuw-woorden-$ds-$i
-alto \
+    score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
    'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > organisatie-$ds
-alto \
+    alto \
-    'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
+        'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > overigen-$ds
+    his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
        'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > nieuw-extra-$ds-$i
-alto \
+    score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
-    'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
+
-    'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > overigen-extra-$ds
+    alto \
        'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > locaties-$ds-$i
    score locaties-$ds-$i > locaties-$ds-$i.score
    alto \
        'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > personen-$ds-$i
    score personen-$ds-$i > personen-$ds-$i.score
    alto \
        'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > organisaties-$ds-$i
    score organisaties-$ds-$i > organisaties-$ds-$i.score
    alto \
        'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > overigen-$ds-$i
    score overigen-$ds-$i > overigen-$ds-$i.score
    alto \
        'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
        'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > overigen-extra-$ds-$i
    score overigen-extra-$ds-$i > overigen-extra-$ds-$i.score
 done