diverse aanpassingen: lemma en postag bij nieuwe woorden, datums in titels, @his="decap" uitgesloten

2026-04-22 19:02:34 +02:00
parent c6c2abb387
commit 9d974b3725
6 changed files with 68 additions and 39 deletions
--- a/cmd/data2json/data2json.go
+++ b/cmd/data2json/data2json.go
@@ -27,18 +27,12 @@ type Data struct {
 }

 type Parts struct {
-	NieuweNamen   [][3]any `json:"nieuwe namen"`
-	NieuweWoorden [][3]any `json:"nieuwe woorden"`
-	Personen      [][3]any `json:"personen"`
-	AndereNamen   [][3]any `json:"andere namen"`
-	Locaties      [][3]any `json:"locaties"`
-	Organisaties  [][3]any `json:"organisaties"`
-}
-
-type Value struct {
-	Word  string `json:"word"`
-	Tags  string `json:"tags"`
-	Count int    `json:"count"`
+	NieuweNamen   [][5]any `json:"nieuwe namen"`
+	NieuweWoorden [][5]any `json:"nieuwe woorden"`
+	Personen      [][5]any `json:"personen"`
+	AndereNamen   [][5]any `json:"andere namen"`
+	Locaties      [][5]any `json:"locaties"`
+	Organisaties  [][5]any `json:"organisaties"`
 }

 var (
@@ -55,7 +49,7 @@ var (
 		suffix string
 	}{
 		"nieuwe namen":   {"nieuwe-namen", ".t20"},
-		"nieuwe woorden": {"nieuwe-woorden", ".t20"},
+		"nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
 		"personen":       {"personen", ""},
 		"andere namen":   {"overige-namen", ""},
 		"locaties":       {"locaties", ""},
@@ -127,8 +121,8 @@ func makeParts(source string) *Parts {
 	}
 }

-func makeValues(source, part string) [][3]any {
-	v := make([][3]any, 0)
+func makeValues(source, part string) [][5]any {
+	v := make([][5]any, 0)

 	filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
 		sources[source],
@@ -149,11 +143,17 @@ func makeValues(source, part string) [][3]any {
 		count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
 		x(err)
 		word := aa[1]
-		var tags string
+		var tags, lemma, postag string
 		if len(aa) > 2 {
 			tags = aa[2]
 		}
-		v = append(v, [3]any{count, word, tags})
+		if len(aa) > 3 {
+			lemma = aa[3]
+		}
+		if len(aa) > 4 {
+			postag = aa[4]
+		}
+		v = append(v, [5]any{count, word, tags, lemma, postag})
 		if lineno == 20 {
 			break
 		}
@@ -173,7 +173,7 @@ func dates() (start, first, last string) {
 	// zoek juiste week
 	var y, w int
 	for {
-		y, w = t.ISOWeek()
+		y, _ = t.ISOWeek()
 		if y < year {
 			t = t.AddDate(0, 12, 0)
 			continue
--- a/cmd/items2count/items2count.go
+++ b/cmd/items2count/items2count.go
@@ -48,6 +48,12 @@ func main() {
 		word := aa[0]
 		tags := aa[1]
 		lbl := aa[2]
+		if n := len(aa); n > 3 {
+			lbl = aa[n-1]
+			for i := 2; i < n-1; i++ {
+				word += "\t" + aa[i]
+			}
+		}
 		w, ok := words[word]
 		if !ok {
 			w = &Word{
@@ -87,7 +93,13 @@ func main() {
 	})

 	for _, w := range wordlist {
-		fmt.Printf("%6d\t%s\t%s\n", w.count, w.word, getTag(w.tags))
+		var tail string
+		i := strings.Index(w.word, "\t")
+		if i > 0 {
+			tail = w.word[i:]
+			w.word = w.word[:i]
+		}
+		fmt.Printf("%6d\t%s\t%s%s\n", w.count, w.word, getTag(w.tags), tail)
 	}

 }
--- a/collect.sh
+++ b/collect.sh
@@ -62,7 +62,11 @@ do

    for i in 1 4
    do
-        files=$(find .. $(week2files $ds $i) | grep -E "$regex")
+        files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
+        if [ -z "$files" ]
+        then
+            continue
+        fi

        # tellingen met tags

@@ -86,7 +90,7 @@ do

        say $part-nieuwe-woorden-$ds-$i
        alto \
-            'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+            'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
            'tt:%w\t%d\t%I' $files \
            | sed -e 's/pubdate: "[-0-9]*"//' \
            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
@@ -125,18 +129,18 @@ do
            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
            | items2count > $part-overige-namen-$ds-$i

-        # tellingen met postags
+        # tellingen met tags en postags

        say $part-nieuwe-woorden-extra-$ds-$i
        alto \
-            'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-        his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
-            'tt:%w\t%l\t%P\t%I' $files \
+            'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+            'tt:%w\t%d\t%l\t%P\t%I' $files \
+            | sed -e 's/pubdate: "[-0-9]*"//' \
            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
-            | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \
-            | grep -v '^ *1 ' \
-            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
-            > $part-nieuwe-woorden-extra-$ds-$i
+            | items2count > $part-nieuwe-woorden-extra-$ds-$i
+        top20 $part-nieuwe-woorden-extra-$ds-$i
+
+        # tellingen met postags

        say $part-nieuwe-adjww-extra-$ds-$i
        alto \
--- a/namen.sh
+++ b/namen.sh
@@ -76,13 +76,13 @@ case $XN in
        ;;
    2)
        # nieuwe woorden
-        EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
+        EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
        TEMPLATE='tt:%w'
        XVALID=1
        ;;
    3)
        # nieuwe woorden met postag en lemma
-        EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
+        EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
        TEMPLATE='tt:%w\t%l\t%P'
        XVALID=1
        ;;
--- a/www/app.js
+++ b/www/app.js
@@ -56,7 +56,7 @@ function makeTD(title, values) {
    if (i < values.length) {
      value = values[i]
    } else {
-      value = [0, '\xa0', '']
+      value = [0, '\xa0', '', '', '']
    }
    if (i == 0) {
      max = value[0]
@@ -68,9 +68,24 @@ function makeTD(title, values) {
        tr.classList.add('tags')
        t2 = '<br><small>' + escape(value[2]) + '</small>'
      }
+      var t3 = ''
+      if (value[3] && value[4]) {
+        t3 =
+          '<hr><small><em>lemma:</em> ' +
+          escape(value[3]) +
+          '<br><em>postag:</em> ' +
+          escape(value[4]) +
+          '</small>'
+      }
      tr.setAttribute(
        'onmouseover',
-        "tooltip.show('" + value[0] + ' \xa0 ' + escape(value[1]) + t2 + "')",
+        "tooltip.show('" +
+          value[0] +
+          ' \xa0 ' +
+          escape(value[1]) +
+          t2 +
+          t3 +
+          "')",
      )
      tr.setAttribute('onmouseout', 'tooltip.hide()')
    }
@@ -97,7 +112,7 @@ async function loadSource(source, week) {
  }

  document.getElementById('subtitle').innerHTML =
-    source + ' — ' + data[week].year + ' week ' + data[week].week
+    source + '<br>' + data[week].start + ' t/m ' + data[week].last

  const d = document.createElement('div')
  const tab = document.createElement('table')
@@ -118,7 +133,7 @@ async function loadPart(part, week) {
  }

  document.getElementById('subtitle').innerHTML =
-    part + ' — ' + data[week].year + ' week ' + data[week].week
+    part + '<br>' + data[week].start + ' t/m ' + data[week].last

  const d = document.createElement('div')
  const tab = document.createElement('table')
@@ -148,9 +163,7 @@ async function loadWeken(source, part) {
        data[week] = await getJSON('DATA-' + week + '-4.json')
      }
      var values = data[week][source][part]
-      tr.appendChild(
-        makeTD(data[week].year + ' week ' + data[week].week, values),
-      )
+      tr.appendChild(makeTD('t/m ' + data[week].last, values))
    }
  }

--- a/www/style2.css
+++ b/www/style2.css
@@ -146,6 +146,6 @@ label:hover {
  transition: all 200ms linear;
 }
 #data.fade {
-  background-color: #ffa54f;
+  background-color: #00fa9a;
  transition: all 20ms linear;
 }