diff --git a/cmd/data2json/data2json.go b/cmd/data2json/data2json.go
index 83f8763..6ce463a 100644
--- a/cmd/data2json/data2json.go
+++ b/cmd/data2json/data2json.go
@@ -27,18 +27,12 @@ type Data struct {
}
type Parts struct {
- NieuweNamen [][3]any `json:"nieuwe namen"`
- NieuweWoorden [][3]any `json:"nieuwe woorden"`
- Personen [][3]any `json:"personen"`
- AndereNamen [][3]any `json:"andere namen"`
- Locaties [][3]any `json:"locaties"`
- Organisaties [][3]any `json:"organisaties"`
-}
-
-type Value struct {
- Word string `json:"word"`
- Tags string `json:"tags"`
- Count int `json:"count"`
+ NieuweNamen [][5]any `json:"nieuwe namen"`
+ NieuweWoorden [][5]any `json:"nieuwe woorden"`
+ Personen [][5]any `json:"personen"`
+ AndereNamen [][5]any `json:"andere namen"`
+ Locaties [][5]any `json:"locaties"`
+ Organisaties [][5]any `json:"organisaties"`
}
var (
@@ -55,7 +49,7 @@ var (
suffix string
}{
"nieuwe namen": {"nieuwe-namen", ".t20"},
- "nieuwe woorden": {"nieuwe-woorden", ".t20"},
+ "nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
"personen": {"personen", ""},
"andere namen": {"overige-namen", ""},
"locaties": {"locaties", ""},
@@ -127,8 +121,8 @@ func makeParts(source string) *Parts {
}
}
-func makeValues(source, part string) [][3]any {
- v := make([][3]any, 0)
+func makeValues(source, part string) [][5]any {
+ v := make([][5]any, 0)
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
sources[source],
@@ -149,11 +143,17 @@ func makeValues(source, part string) [][3]any {
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
x(err)
word := aa[1]
- var tags string
+ var tags, lemma, postag string
if len(aa) > 2 {
tags = aa[2]
}
- v = append(v, [3]any{count, word, tags})
+ if len(aa) > 3 {
+ lemma = aa[3]
+ }
+ if len(aa) > 4 {
+ postag = aa[4]
+ }
+ v = append(v, [5]any{count, word, tags, lemma, postag})
if lineno == 20 {
break
}
@@ -173,7 +173,7 @@ func dates() (start, first, last string) {
// zoek juiste week
var y, w int
for {
- y, w = t.ISOWeek()
+ y, _ = t.ISOWeek()
if y < year {
t = t.AddDate(0, 12, 0)
continue
diff --git a/cmd/items2count/items2count.go b/cmd/items2count/items2count.go
index 30319e6..77d138a 100644
--- a/cmd/items2count/items2count.go
+++ b/cmd/items2count/items2count.go
@@ -48,6 +48,12 @@ func main() {
word := aa[0]
tags := aa[1]
lbl := aa[2]
+ if n := len(aa); n > 3 {
+ lbl = aa[n-1]
+ for i := 2; i < n-1; i++ {
+ word += "\t" + aa[i]
+ }
+ }
w, ok := words[word]
if !ok {
w = &Word{
@@ -87,7 +93,13 @@ func main() {
})
for _, w := range wordlist {
- fmt.Printf("%6d\t%s\t%s\n", w.count, w.word, getTag(w.tags))
+ var tail string
+ i := strings.Index(w.word, "\t")
+ if i > 0 {
+ tail = w.word[i:]
+ w.word = w.word[:i]
+ }
+ fmt.Printf("%6d\t%s\t%s%s\n", w.count, w.word, getTag(w.tags), tail)
}
}
diff --git a/collect.sh b/collect.sh
index 3b49ef0..cf2e230 100755
--- a/collect.sh
+++ b/collect.sh
@@ -62,7 +62,11 @@ do
for i in 1 4
do
- files=$(find .. $(week2files $ds $i) | grep -E "$regex")
+ files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
+ if [ -z "$files" ]
+ then
+ continue
+ fi
# tellingen met tags
@@ -86,7 +90,7 @@ do
say $part-nieuwe-woorden-$ds-$i
alto \
- 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+ 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
'tt:%w\t%d\t%I' $files \
| sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
@@ -125,18 +129,18 @@ do
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| items2count > $part-overige-namen-$ds-$i
- # tellingen met postags
+ # tellingen met tags en postags
say $part-nieuwe-woorden-extra-$ds-$i
alto \
- 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
- his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
- 'tt:%w\t%l\t%P\t%I' $files \
+ 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+ 'tt:%w\t%d\t%l\t%P\t%I' $files \
+ | sed -e 's/pubdate: "[-0-9]*"//' \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
- | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \
- | grep -v '^ *1 ' \
- | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
- > $part-nieuwe-woorden-extra-$ds-$i
+ | items2count > $part-nieuwe-woorden-extra-$ds-$i
+ top20 $part-nieuwe-woorden-extra-$ds-$i
+
+ # tellingen met postags
say $part-nieuwe-adjww-extra-$ds-$i
alto \
diff --git a/namen.sh b/namen.sh
index 7222c46..ff2fadf 100755
--- a/namen.sh
+++ b/namen.sh
@@ -76,13 +76,13 @@ case $XN in
;;
2)
# nieuwe woorden
- EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
+ EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
TEMPLATE='tt:%w'
XVALID=1
;;
3)
# nieuwe woorden met postag en lemma
- EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
+ EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]'
TEMPLATE='tt:%w\t%l\t%P'
XVALID=1
;;
diff --git a/www/app.js b/www/app.js
index 4138724..c0e20cd 100644
--- a/www/app.js
+++ b/www/app.js
@@ -56,7 +56,7 @@ function makeTD(title, values) {
if (i < values.length) {
value = values[i]
} else {
- value = [0, '\xa0', '']
+ value = [0, '\xa0', '', '', '']
}
if (i == 0) {
max = value[0]
@@ -68,9 +68,24 @@ function makeTD(title, values) {
tr.classList.add('tags')
t2 = '
' + escape(value[2]) + ''
}
+ var t3 = ''
+ if (value[3] && value[4]) {
+ t3 =
+ '