score; collect.sh

2026-03-19 12:37:35 +01:00
parent 01438b69ac
commit 025c134c07
5 changed files with 174 additions and 34 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ Tzum/xml2txt
 VRT/metadata
 VRT/vrt
 bin/ISOWeek
+bin/score
 bin/week2files
 20??
 corpus
--- a/4
+++ b/4
@@ -11,11 +11,15 @@ all:
 	make -C Tzum
 	make -C VRT
 	make bin/ISOWeek
+	make bin/score
 	make bin/week2files

 bin/ISOWeek: cmd/ISOWeek/*.go
 	go build -o $@ $^

+bin/score: cmd/score/*.go
+	go build -o $@ $^
+
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^

--- a/cmd/score/score.go
+++ b/cmd/score/score.go
@@ -0,0 +1,112 @@
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// Item is one row of the report: an entry read from the current week's file
+// together with how its count compares to the previous week's file.
+type Item struct {
+	// text is the part of the input line after the first tab.
+	text  string
+	// score is the current count minus the count found for the same text in
+	// the previous file (zero when the text is absent there).
+	score int
+	// isnew is true when the text does not occur in the previous file.
+	isnew bool
+}
+
+var (
+	// x is shorthand for e.ExitErr and is applied to every error value in
+	// this file.
+	// NOTE(review): presumably it reports a non-nil error and exits, and does
+	// nothing for nil — confirm against the errors package.
+	x          = e.ExitErr
+	// reYearWeek splits a name into three groups: a prefix, a year-week stamp
+	// of the form YYYY-WW (year starting with 1 or 2, week 00-59), and a
+	// suffix.
+	reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
+	// count maps an item's text to its count in the previous week's file.
+	count      = make(map[string]int)
+	// items holds one Item per line of the current week's file.
+	items      = make([]Item, 0)
+)
+
+// main compares the counts in the file named by the first command-line
+// argument with the counts in the file of the preceding week (located with
+// getPrev) and prints one line per item of the current file.
+//
+// Both files hold one item per line: a count, a tab, and the item text.
+// Each output line is "<flag>\t<score>\t<text>", where flag is "N" for an
+// item that is absent from the previous file and "." otherwise, and score is
+// the current count minus the previous count. Lines are ordered by
+// descending score and, for equal scores, by ascending text.
+//
+// NOTE(review): error handling relies on x (e.ExitErr) terminating the
+// program on a non-nil error — confirm against the errors package.
+func main() {
+	if len(os.Args) < 2 {
+		x(fmt.Errorf("expected a filename argument, got %d arguments", len(os.Args)-1))
+		return
+	}
+	filename := os.Args[1]
+	prevname := getPrev(filename)
+
+	// Previous week: remember the count of every item.
+	readCounts(prevname, func(n int, text string) {
+		count[text] = n
+	})
+
+	// Current week: score every item against the previous count. A text
+	// missing from the previous file yields a previous count of zero.
+	readCounts(filename, func(n int, text string) {
+		prev, seen := count[text]
+		items = append(items, Item{
+			text:  text,
+			score: n - prev,
+			isnew: !seen,
+		})
+	})
+
+	sort.Slice(items, func(i, j int) bool {
+		/*
+			Disabled: list new items before all others.
+
+			if items[i].isnew && !items[j].isnew {
+				return true
+			}
+			if !items[i].isnew && items[j].isnew {
+				return false
+			}
+		*/
+		a, b := items[i], items[j]
+		if a.score != b.score {
+			return a.score > b.score
+		}
+		return a.text < b.text
+	})
+
+	// Buffer the report so that a long list does not cost one write per line.
+	out := bufio.NewWriter(os.Stdout)
+	for _, item := range items {
+		/*
+			Disabled: stop at the first item with a score below 2.
+
+			if item.score < 2 {
+				break
+			}
+		*/
+		flag := "."
+		if item.isnew {
+			flag = "N"
+		}
+		fmt.Fprintf(out, "%s\t%4d\t%s\n", flag, item.score, item.text)
+	}
+	x(out.Flush())
+}
+
+// readCounts reads filename line by line and calls fn with the count and the
+// text of each line. A line must consist of a count, a tab, and the text;
+// whitespace around the count is ignored. Open, read, close and format
+// errors are passed to x; format errors carry the file name and line number.
+func readCounts(filename string, fn func(n int, text string)) {
+	fp, err := os.Open(filename)
+	x(err)
+
+	lineno := 0
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		lineno++
+		num, text, ok := strings.Cut(scanner.Text(), "\t")
+		if !ok {
+			// Without this check a line lacking a tab would cause an
+			// index-out-of-range panic instead of a readable error.
+			x(fmt.Errorf("%s:%d: missing tab between count and text", filename, lineno))
+			continue
+		}
+		n, err := strconv.Atoi(strings.TrimSpace(num))
+		if err != nil {
+			x(fmt.Errorf("%s:%d: %w", filename, lineno, err))
+			continue
+		}
+		fn(n, text)
+	}
+	x(scanner.Err())
+	x(fp.Close())
+}
+
+// getPrev returns the name of the file that belongs to the week before the
+// one named in filename.
+//
+// filename must contain a year-week stamp of the form YYYY-WW as matched by
+// reYearWeek; the result is filename with that stamp replaced by the stamp
+// of the preceding week. When the preceding week lies in the previous year,
+// week 53 of that year is used if a file with that name exists, and week 52
+// otherwise.
+//
+// A filename without a year-week stamp is reported through x instead of
+// causing a nil-slice panic.
+func getPrev(filename string) string {
+	mm := reYearWeek.FindStringSubmatch(filename)
+	if mm == nil {
+		x(fmt.Errorf("no year-week (YYYY-WW) in filename %q", filename))
+		return ""
+	}
+	prefix, stamp, suffix := mm[1], mm[2], mm[3]
+
+	// stamp is exactly "YYYY-WW": four digits, a dash, two digits.
+	year, err := strconv.Atoi(stamp[:4])
+	x(err)
+	week, err := strconv.Atoi(stamp[5:])
+	x(err)
+
+	week--
+	if week == 0 {
+		// Week 1 is preceded by the last week of the previous year.
+		week = 53
+		year--
+	}
+
+	newname := fmt.Sprintf("%s%d-%02d%s", prefix, year, week, suffix)
+	if week != 53 {
+		return newname
+	}
+
+	// Not every year has a week 53: use it only when its file exists and
+	// fall back to week 52 otherwise.
+	if _, err := os.Stat(newname); err == nil {
+		return newname
+	}
+	return fmt.Sprintf("%s%d-%02d%s", prefix, year, week-1, suffix)
+}
--- a/cmd/week2files/week2files.go
+++ b/cmd/week2files/week2files.go
@@ -25,6 +25,8 @@ func main() {
 	week, err := strconv.Atoi(aa[1])
 	x(err)

+	weken, err := strconv.Atoi(os.Args[2])
+
 	if year < 1000 || year > 9999 {
 		x(fmt.Errorf("ongeldig year: %d", year))
 	}
@@ -74,9 +76,9 @@ func main() {
 	}
 	t = t.AddDate(0, 0, 1-d)

-	// drie voorgaande weken
-	t2 := t.AddDate(0, 0, -21)
-	for i := range 4 {
+	// voorgaande weken en deze week
+	t2 := t.AddDate(0, 0, -7*(weken-1))
+	for i := range weken {
 		if i > 0 {
 			fmt.Print(" -or")
 		}
@@ -85,9 +87,9 @@ func main() {
 		t2 = t2.AddDate(0, 0, 7)
 	}

-	// vanaf begin drie weken geleden t/m eind huidige week
-	t = t.AddDate(0, 0, -21)
-	for range 28 {
+	// vanaf begin voorgaande weken t/m eind huidige week
+	t = t.AddDate(0, 0, -7*(weken-1))
+	for range 7 * weken {
 		fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
 		t = t.AddDate(0, 0, 1)
 	}
--- a/collect.sh
+++ b/collect.sh
@@ -21,40 +21,61 @@ else
    esac
 fi

-files=$(find /net/corpora/nlnieuws/ $(week2files $ds))
-
 cd /net/corpora/nlnieuws/data

-alto \
-    'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
-    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds
+for i in 1 4
+do

-alto \
-    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
-    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds
+    files=$(find .. $(week2files $ds $i))

-alto \
-    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
-    'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > nieuw-extra-$ds
+    alto \
+        'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
+        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > nieuw-namen-$ds-$i

-alto \
-    'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > locaties-$ds
+    score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score

-alto \
-    'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > personen-$ds
+    alto \
+        'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
+    his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > nieuw-woorden-$ds-$i

-alto \
-    'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > organisatie-$ds
+    score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score

-alto \
-    'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > overigen-$ds
+    alto \
+        'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
+    his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+        'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > nieuw-extra-$ds-$i

-alto \
-    'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
-    'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sort -nr > overigen-extra-$ds
+    score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > locaties-$ds-$i
+
+    score locaties-$ds-$i > locaties-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > personen-$ds-$i
+
+    score personen-$ds-$i > personen-$ds-$i.score
+
+    # Organisations (neclass ORG): same query shape as the LOC, PER and MISC
+    # pipelines. Entries seen only once are dropped, the count is separated
+    # from the text by a tab, and the list is sorted by descending count.
+    # The multi-word-unit branch compares @his with "normal" (it previously
+    # had a stray leading space, " normal", unlike the sibling queries).
+    alto \
+        'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > organisaties-$ds-$i
+
+    score organisaties-$ds-$i > organisaties-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > overigen-$ds-$i
+
+    score overigen-$ds-$i > overigen-$ds-$i.score
+
+    alto \
+        'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
+        'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 '  | sed -e 's/\([0-9]\)  */\1\t/' | sort -nr > overigen-extra-$ds-$i
+
+    score overigen-extra-$ds-$i > overigen-extra-$ds-$i.score
+
+done