diff --git a/cmd/data2json/data2json.go b/cmd/data2json/data2json.go index 83f8763..6ce463a 100644 --- a/cmd/data2json/data2json.go +++ b/cmd/data2json/data2json.go @@ -27,18 +27,12 @@ type Data struct { } type Parts struct { - NieuweNamen [][3]any `json:"nieuwe namen"` - NieuweWoorden [][3]any `json:"nieuwe woorden"` - Personen [][3]any `json:"personen"` - AndereNamen [][3]any `json:"andere namen"` - Locaties [][3]any `json:"locaties"` - Organisaties [][3]any `json:"organisaties"` -} - -type Value struct { - Word string `json:"word"` - Tags string `json:"tags"` - Count int `json:"count"` + NieuweNamen [][5]any `json:"nieuwe namen"` + NieuweWoorden [][5]any `json:"nieuwe woorden"` + Personen [][5]any `json:"personen"` + AndereNamen [][5]any `json:"andere namen"` + Locaties [][5]any `json:"locaties"` + Organisaties [][5]any `json:"organisaties"` } var ( @@ -55,7 +49,7 @@ var ( suffix string }{ "nieuwe namen": {"nieuwe-namen", ".t20"}, - "nieuwe woorden": {"nieuwe-woorden", ".t20"}, + "nieuwe woorden": {"nieuwe-woorden-extra", ".t20"}, "personen": {"personen", ""}, "andere namen": {"overige-namen", ""}, "locaties": {"locaties", ""}, @@ -127,8 +121,8 @@ func makeParts(source string) *Parts { } } -func makeValues(source, part string) [][3]any { - v := make([][3]any, 0) +func makeValues(source, part string) [][5]any { + v := make([][5]any, 0) filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s", sources[source], @@ -149,11 +143,17 @@ func makeValues(source, part string) [][3]any { count, err := strconv.Atoi(strings.TrimSpace(aa[0])) x(err) word := aa[1] - var tags string + var tags, lemma, postag string if len(aa) > 2 { tags = aa[2] } - v = append(v, [3]any{count, word, tags}) + if len(aa) > 3 { + lemma = aa[3] + } + if len(aa) > 4 { + postag = aa[4] + } + v = append(v, [5]any{count, word, tags, lemma, postag}) if lineno == 20 { break } @@ -173,7 +173,7 @@ func dates() (start, first, last string) { // zoek juiste week var y, w int for { - y, w = t.ISOWeek() + y, _ = t.ISOWeek() if y < year { t = t.AddDate(0, 12, 0) continue diff --git a/cmd/items2count/items2count.go b/cmd/items2count/items2count.go index 30319e6..77d138a 100644 --- a/cmd/items2count/items2count.go +++ b/cmd/items2count/items2count.go @@ -48,6 +48,12 @@ func main() { word := aa[0] tags := aa[1] lbl := aa[2] + if n := len(aa); n > 3 { + lbl = aa[n-1] + for i := 2; i < n-1; i++ { + word += "\t" + aa[i] + } + } w, ok := words[word] if !ok { w = &Word{ @@ -87,7 +93,13 @@ func main() { }) for _, w := range wordlist { - fmt.Printf("%6d\t%s\t%s\n", w.count, w.word, getTag(w.tags)) + var tail string + i := strings.Index(w.word, "\t") + if i > 0 { + tail = w.word[i:] + w.word = w.word[:i] + } + fmt.Printf("%6d\t%s\t%s%s\n", w.count, w.word, getTag(w.tags), tail) } } diff --git a/collect.sh b/collect.sh index 3b49ef0..cf2e230 100755 --- a/collect.sh +++ b/collect.sh @@ -62,7 +62,11 @@ do for i in 1 4 do - files=$(find .. $(week2files $ds $i) | grep -E "$regex") + files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true + if [ -z "$files" ] + then + continue + fi # tellingen met tags @@ -86,7 +90,7 @@ do say $part-nieuwe-woorden-$ds-$i alto \ - 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ + 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ 'tt:%w\t%d\t%I' $files \ | sed -e 's/pubdate: "[-0-9]*"//' \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ @@ -125,18 +129,18 @@ do | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | items2count > $part-overige-namen-$ds-$i - # tellingen met postags + # tellingen met tags en postags say $part-nieuwe-woorden-extra-$ds-$i alto \ - 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ - his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ - 'tt:%w\t%l\t%P\t%I' $files \ + 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ + 'tt:%w\t%d\t%l\t%P\t%I' $files \ + | sed -e 's/pubdate: "[-0-9]*"//' \ | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ - | sed -e 's/\(.*\)\t.*/\1/' | uniq -c \ - | grep -v '^ *1 ' \ - | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-nieuwe-woorden-extra-$ds-$i + | items2count > $part-nieuwe-woorden-extra-$ds-$i + top20 $part-nieuwe-woorden-extra-$ds-$i + + # tellingen met postags say $part-nieuwe-adjww-extra-$ds-$i alto \ diff --git a/namen.sh b/namen.sh index 7222c46..ff2fadf 100755 --- a/namen.sh +++ b/namen.sh @@ -76,13 +76,13 @@ case $XN in ;; 2) # nieuwe woorden - EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' + EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' TEMPLATE='tt:%w' XVALID=1 ;; 3) # nieuwe woorden met postag en lemma - EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' + EXPR='fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="decap" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' TEMPLATE='tt:%w\t%l\t%P' XVALID=1 ;; diff --git a/www/app.js b/www/app.js index 4138724..c0e20cd 100644 --- a/www/app.js +++ b/www/app.js @@ -56,7 +56,7 @@ function makeTD(title, values) { if (i < values.length) { value = values[i] } else { - value = [0, '\xa0', ''] + value = [0, '\xa0', '', '', ''] } if (i == 0) { max = value[0] @@ -68,9 +68,24 @@ function makeTD(title, values) { tr.classList.add('tags') t2 = '
' + escape(value[2]) + '' } + var t3 = '' + if (value[3] && value[4]) { + t3 = + '
lemma: ' + + escape(value[3]) + + '
postag: ' + + escape(value[4]) + + '
' + } tr.setAttribute( 'onmouseover', - "tooltip.show('" + value[0] + ' \xa0 ' + escape(value[1]) + t2 + "')", + "tooltip.show('" + + value[0] + + ' \xa0 ' + + escape(value[1]) + + t2 + + t3 + + "')", ) tr.setAttribute('onmouseout', 'tooltip.hide()') } @@ -97,7 +112,7 @@ async function loadSource(source, week) { } document.getElementById('subtitle').innerHTML = - source + ' — ' + data[week].year + ' week ' + data[week].week + source + '
' + data[week].start + ' t/m ' + data[week].last const d = document.createElement('div') const tab = document.createElement('table') @@ -118,7 +133,7 @@ async function loadPart(part, week) { } document.getElementById('subtitle').innerHTML = - part + ' — ' + data[week].year + ' week ' + data[week].week + part + '
' + data[week].start + ' t/m ' + data[week].last const d = document.createElement('div') const tab = document.createElement('table') @@ -148,9 +163,7 @@ async function loadWeken(source, part) { data[week] = await getJSON('DATA-' + week + '-4.json') } var values = data[week][source][part] - tr.appendChild( - makeTD(data[week].year + ' week ' + data[week].week, values), - ) + tr.appendChild(makeTD('t/m ' + data[week].last, values)) } } diff --git a/www/style2.css b/www/style2.css index 2d439a4..968e801 100644 --- a/www/style2.css +++ b/www/style2.css @@ -146,6 +146,6 @@ label:hover { transition: all 200ms linear; } #data.fade { - background-color: #ffa54f; + background-color: #00fa9a; transition: all 20ms linear; }