This commit is contained in:
Peter Kleiweg
2026-07-02 17:19:30 +02:00
parent 73c21b4f8f
commit 2466dbb43c
7 changed files with 738 additions and 3 deletions

1
.gitignore vendored
View File

@@ -43,6 +43,7 @@ Volkskrant/volkskrant
VRT/metadata
VRT/vrt
bin/data2json
bin/data22json
bin/dates2json
bin/flush
bin/items2count

View File

@@ -19,6 +19,7 @@ all:
make -C Volkskrant
make -C VRT
make bin/data2json
make bin/data22json
make bin/dates2json
make bin/flush
make bin/items2count
@@ -31,6 +32,9 @@ all:
bin/data2json: cmd/data2json/*.go
go build -o $@ $^
bin/data22json: cmd/data22json/*.go
go build -o $@ $^
bin/dates2json: cmd/dates2json/*.go
go build -o $@ $^

View File

@@ -0,0 +1,244 @@
package main
import (
e "codeberg.org/pebbe/errors"
"encoding/json"
"fmt"
"os"
"regexp"
"strconv"
"strings"
"time"
)
// Data is the document that main builds, marshals with encoding/json and
// prints to standard output. encoding/json writes struct fields in
// declaration order, so reordering fields changes the key order of the output.
type Data struct {
Year int `json:"year"` // year taken from the first command-line argument
Week int `json:"week"` // week number taken from the first command-line argument
First string `json:"first"` // formatted first day of the requested week (see dates)
Last string `json:"last"` // formatted last day of the requested week (see dates)
Period int `json:"period"` // period length in weeks, from the second command-line argument
Start string `json:"start"` // formatted first day of the whole period (see dates)
Max int `json:"max"` // highest value found in Sources (see makeCounts)
Sources map[string]int `json:"sources"` // per source directory: number of .xml files minus number of .skip files
// One Parts value per collection, each filled by makeParts.
Algemeen *Parts `json:"Algemeen"`
Groningen *Parts `json:"Groningen"`
Amsterdam *Parts `json:"Amsterdam"`
Literatuur *Parts `json:"Literatuur"`
Vlaanderen *Parts `json:"Vlaanderen"`
}
// Parts holds the score tables of one collection, one table per category.
// Every row is built by makeValues as {N, Word, tags, G2, PG2}, and
// makeValues returns at most 20 rows per table.
type Parts struct {
Woorden [][5]any `json:"woorden"` // rows read from the "allewoorden" score file
Personen [][5]any `json:"personen"` // rows read from the "personen" score file
AndereNamen [][5]any `json:"andere namen"` // rows read from the "overige-namen" score file
Locaties [][5]any `json:"locaties"` // rows read from the "locaties" score file
Organisaties [][5]any `json:"organisaties"` // rows read from the "organisaties" score file
}
// Scores is the part of a score JSON file that makeValues decodes.
// Only the "up" key is read; any other keys in the file are ignored.
type Scores struct {
Up []Item `json:"up"` // items in file order; makeValues keeps the first 20 with N > 0
}
// Item is one entry of the "up" list in a score JSON file.
type Item struct {
Extras string `json:"extras"` // tab-separated text; makeValues only uses the part before the first tab
G2 float64 `json:"g2"` // score copied to column 3 of the output row (presumably a G2 statistic; verify against the producer)
N int `json:"n"` // count; makeValues skips items where N is not positive
PG2 float64 `json:"p_g2"` // value copied to column 4 of the output row (meaning not visible here)
Word string `json:"word"` // the word or name itself
}
// Package-level lookup tables and the values parsed from the command line.
var (
// sources maps a collection name (as used in Data) to the lower-case
// prefix of its score file names.
sources = map[string]string{
"Algemeen": "algemeen",
"Amsterdam": "amsterdam",
"Groningen": "groningen",
"Literatuur": "literatuur",
"Vlaanderen": "vlaanderen",
}
// parts maps a category key (as used in the JSON output) to the file name
// component of its score file and an optional regular expression.
// NOTE(review): re is not referenced anywhere in the code visible here.
parts = map[string]struct {
file string
re *regexp.Regexp
}{
"woorden": {"allewoorden", nil},
"personen": {"personen", nil},
"andere namen": {"overige-namen", nil},
"locaties": {"locaties", nil},
"organisaties": {"organisaties", regexp.MustCompile(`^(ANP|AT5)`)},
}
// maanden holds the Dutch month names; index 0 is a placeholder so the
// slice can be indexed directly with a time.Month value (1..12).
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
// dagen holds the Dutch day names, indexed by time.Weekday (0 = Sunday).
dagen = strings.Fields("zondag maandag dinsdag woensdag donderdag vrijdag zaterdag")
// year, week and size are set once by main from the command-line arguments.
year int
week int
size int
// x is shorthand for the error helper of the pebbe/errors package; the
// code relies on it to stop the program on a non-nil error
// (presumably by exiting; confirm in that package).
x = e.ExitErr
)
// main parses the command line, gathers the data for one week and prints
// it to standard output as a single JSON document.
//
// Arguments:
//
//	yyyy.ww  year and week number, separated by a dot
//	size     length of the period in weeks
//
// Every failure is reported through x.
func main() {
	// Without this check a missing argument makes os.Args[1] or os.Args[2]
	// panic with an index-out-of-range instead of giving a usable message.
	if len(os.Args) != 3 {
		x(fmt.Errorf("ongeldig aantal argumenten, verwacht: yyyy.ww periode"))
	}

	aa := strings.Split(os.Args[1], ".")
	if len(aa) != 2 {
		// The argument is year.week, so the hint reads yyyy.ww (it used to say yyyy.dd).
		x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.ww zijn"))
	}

	var err error
	year, err = strconv.Atoi(aa[0])
	x(err)
	week, err = strconv.Atoi(aa[1])
	x(err)
	size, err = strconv.Atoi(os.Args[2])
	x(err)

	if year < 1000 || year > 9999 {
		x(fmt.Errorf("ongeldig year: %d", year))
	}
	if week < 1 || week > 53 {
		x(fmt.Errorf("ongeldige week: %d", week))
	}

	start, first, last, names := dates()
	// Local names chosen so they do not shadow the builtin max or the
	// package-level sources map that makeValues reads.
	maxCount, counts := makeCounts(names)

	data := &Data{
		Year:       year,
		Week:       week,
		First:      first,
		Last:       last,
		Period:     size,
		Start:      start,
		Max:        maxCount,
		Sources:    counts,
		Algemeen:   makeParts("Algemeen"),
		Groningen:  makeParts("Groningen"),
		Amsterdam:  makeParts("Amsterdam"),
		Literatuur: makeParts("Literatuur"),
		Vlaanderen: makeParts("Vlaanderen"),
	}

	b, err := json.Marshal(data)
	x(err)
	fmt.Println(string(b))
}
// makeParts loads the five score tables of one collection.
// source is a key of the package-level sources map, e.g. "Algemeen".
func makeParts(source string) *Parts {
	p := new(Parts)
	p.Woorden = makeValues(source, "woorden")
	p.Personen = makeValues(source, "personen")
	p.AndereNamen = makeValues(source, "andere namen")
	p.Locaties = makeValues(source, "locaties")
	p.Organisaties = makeValues(source, "organisaties")
	return p
}
// makeValues reads the score file for one collection and one category and
// returns its first 20 items that have a positive count.
//
// The file name is derived from the package-level year, week and size. Each
// returned row is {N, Word, tags, G2, PG2}, where tags is the part of
// Item.Extras before the first tab. The result is never nil, so it is
// marshalled as [] rather than null when there are no rows.
func makeValues(source, part string) [][5]any {
	const maxRows = 20

	path := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d.score%d4.json",
		year, sources[source], parts[part].file, year, week, size)
	raw, err := os.ReadFile(path)
	x(err)

	var scores Scores
	x(json.Unmarshal(raw, &scores))

	rows := [][5]any{}
	for _, item := range scores.Up {
		if item.N <= 0 {
			continue
		}
		// Keep only the text before the first tab; keep everything when there is no tab.
		tags := item.Extras
		if i := strings.IndexByte(tags, '\t'); i >= 0 {
			tags = tags[:i]
		}
		rows = append(rows, [5]any{item.N, item.Word, tags, item.G2, item.PG2})
		if len(rows) == maxRows {
			break
		}
	}
	return rows
}
// makeCounts counts, per source directory, the documents found in the given
// sub-directories.
//
// It changes the working directory to /net/corpora/nlnieuws and looks at
// every directory there whose name starts with an upper-case ASCII letter.
// For each of those, every file ending in ".xml" below one of names adds one
// and every file ending in ".skip" subtracts one. Sub-directories that
// cannot be read are skipped silently.
//
// It returns the highest count and the count per directory name.
func makeCounts(names []string) (int, map[string]int) {
	x(os.Chdir("/net/corpora/nlnieuws"))
	entries, err := os.ReadDir(".")
	x(err)

	highest := 0
	totals := make(map[string]int)
	for _, entry := range entries {
		dir := entry.Name()
		if !entry.IsDir() || dir[0] < 'A' || dir[0] > 'Z' {
			continue
		}

		total := 0
		for _, name := range names {
			items, err := os.ReadDir(dir + "/" + name)
			if err != nil {
				continue
			}
			for _, item := range items {
				switch n := item.Name(); {
				case strings.HasSuffix(n, ".xml"):
					total++
				case strings.HasSuffix(n, ".skip"):
					total--
				}
			}
		}

		totals[dir] = total
		if total > highest {
			highest = total
		}
	}
	return highest, totals
}
// dates derives everything date-related from the package-level year, week
// and size.
//
// start is the formatted first day of the period (size weeks ending with the
// requested week), first and last are the formatted Monday and Sunday of the
// requested week. names lists the sub-directory names to scan: first one
// "yyyy/wNN" entry per week of the period, then one "yyyy/mm/dd" entry per
// day of the period.
func dates() (start, first, last string, names []string) {
	// Noon on 1 January of the requested year.
	jan1 := time.Date(year, time.January, 1, 12, 0, 0, 0, time.UTC)

	// Days from 1 January to the first Thursday of the year
	// (time.Weekday: 0 = Sunday, 4 = Thursday).
	toThursday := (int(time.Thursday) - int(jan1.Weekday()) + 7) % 7

	// Three days before that Thursday is the Monday of week 1;
	// add week-1 whole weeks to reach the requested week.
	monday := jan1.AddDate(0, 0, toThursday-3+7*(week-1))
	sunday := monday.AddDate(0, 0, 6)
	periodStart := monday.AddDate(0, 0, 7*(1-size))

	names = []string{}
	for i, t := 0, periodStart; i < size; i, t = i+1, t.AddDate(0, 0, 7) {
		y, w := t.ISOWeek()
		names = append(names, fmt.Sprintf("%d/w%02d", y, w))
	}
	for i, t := 0, periodStart; i < 7*size; i, t = i+1, t.AddDate(0, 0, 1) {
		names = append(names, fmt.Sprintf("%d/%02d/%02d", t.Year(), t.Month(), t.Day()))
	}

	start, first, last = makeDate(periodStart), makeDate(monday), makeDate(sunday)
	return start, first, last, names
}
// makeDate formats d as a short Dutch date: the first two letters of the day
// name, the day of the month, the first three letters of the month name and
// the year, e.g. "do 2 jul 2026".
func makeDate(d time.Time) string {
	day := dagen[d.Weekday()][:2]
	month := maanden[int(d.Month())][:3]
	return fmt.Sprintf("%s %d %s %d", day, d.Day(), month, d.Year())
}

View File

@@ -26,6 +26,8 @@ var (
func main() {
suffix := os.Args[1]
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
x(err)
for _, dir := range dirs {
@@ -41,7 +43,7 @@ func main() {
for _, file := range files {
filename := file.Name()
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, suffix) {
addWeek(filename[5:12])
}
}

View File

@@ -201,8 +201,10 @@ done
data2json $ds 1 > ../json/$year/DATA-$ds-1.json
data2json $ds 4 > ../json/$year/DATA-$ds-4.json
dates2json > ../json/index1.json
dates2json > ../json/index4.json
data22json $ds 1 > ../json/$year/DATA-$ds-G2.json
dates2json 1.json > ../json/index1.json
dates2json 4.json > ../json/index4.json
dates2json G2.json > ../json/indexG2.json
# rechten bijwerken
chmod -R g+w /net/corpora/nlnieuws

209
www/app2.html Normal file
View File

@@ -0,0 +1,209 @@
<!DOCTYPE html>
<html>
<!-- Document head: page title, viewport, icon, style sheets and scripts. -->
<head>
<title id="title">Woord van de maand</title>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link rel="icon" href="favicon.ico" type="image/ico" />
<link rel="stylesheet" href="style.css" />
<link rel="stylesheet" type="text/css" href="tooltip.css" />
<!-- tooltip.js provides the tooltip object used by the row handlers that app2.js generates. -->
<script type="text/javascript" src="tooltip.js"></script>
<!-- app2.js is deferred: it looks up form and container elements at top level, so the body must be parsed first. -->
<script type="text/javascript" src="app2.js" defer></script>
</head>
<body>
<!-- Page heading. -->
<div class="title">
<h1>Woord van de maand</h1>
</div>
<!-- Selection form; app2.js reads it through document.forms['choice']. -->
<div class="form">
<form name="choice">
Wat wil je zien?
<br />
<!-- View selector: each radio button calls opt(n) in app2.js, which enables the submit
     button and enables/disables the source and part selectors below. -->
<ol>
<li>
<input type="radio" name="fWhat" id="fOpt1" value="opt1" onclick="opt(1)" />
<label for="fOpt1">Een specifieke bron</label>
</li>
<li>
<input type="radio" name="fWhat" id="fOpt2" value="opt2" onclick="opt(2)" />
<label for="fOpt2">Een specifiek onderdeel</label>
</li>
<li>
<input type="radio" name="fWhat" id="fOpt3" value="opt3" onclick="opt(3)" />
<label for="fOpt3">Een tijdreeks voor een bron en onderdeel</label>
</li>
</ol>
<!-- Source selector; the option texts are the collection names used as keys in the JSON data. -->
<div class="option" id="source">
bron:
<select name="source" id="fSource">
<option>Algemeen</option>
<option>Amsterdam</option>
<option>Groningen</option>
<option>Literatuur</option>
<option>Vlaanderen</option>
</select>
</div>
<!-- Part selector; the option texts are the category names used as keys in the JSON data. -->
<div class="option" id="part">
onderdeel:
<select name="part" id="fPart">
<option>woorden</option>
<option>personen</option>
<option>locaties</option>
<option>organisaties</option>
<option>andere namen</option>
</select>
</div>
<!-- Week selector; init() in app2.js sets its min and max attributes from the week index. -->
<div class="option" id="week">
week:
<input type="date" id="fDate" name="date" step="7" />
<span class="validity"></span>
</div>
<!-- Starts disabled; opt() enables it once a view is chosen, and kies() renders the selection. -->
<button type="button" onclick="kies()" id="fSubmit" disabled>&mdash; laat maar zien &mdash;</button>
</form>
</div>
<!-- Subtitle of the current view; filled by the load functions in app2.js. -->
<div class="title">
<h2 id="subtitle"></h2>
</div>
<!-- Container for the generated tables; app2.js replaces its innerHTML. -->
<div class="main" id="data"></div>
<!-- Overview of the sources per collection. The div inside each td.bar has an id that
     setCounts() in app2.js looks up by source name and resizes to the relative number
     of documents, so every key of "sources" in the JSON data needs a div here. -->
<div class="foot">
<h2>Bronnen</h2>
<table class="bron">
<!-- Collection: Algemeen -->
<tr>
<td>Algemeen</td>
<td class="bar">
<div id="NieuwsNL" style="width: 100%"></div>
</td>
<td><a href="https://nieuws.nl/">NieuwsNL</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="NOS"></div>
</td>
<td><a href="https://nos.nl/">NOS</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="NU"></div>
</td>
<td><a href="https://www.nu.nl/">NU</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="RO"></div>
</td>
<td><a href="https://reportersonline.nl/">Reporters Online</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="Sargasso"></div>
</td>
<td><a href="https://sargasso.nl/">Sargasso</a></td>
</tr>
<tr class="last">
<td></td>
<td class="bar">
<div id="Volkskrant"></div>
</td>
<td><a href="https://www.volkskrant.nl/">de Volkskrant</a></td>
</tr>
<!-- Collection: Amsterdam -->
<tr class="first">
<td>Amsterdam</td>
<td class="bar">
<div id="AT5"></div>
</td>
<td><a href="https://www.at5.nl/">AT5</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="Parool"></div>
</td>
<td>
<a href="https://www.parool.nl/amsterdam/">Het Parool | Amsterdam</a>
</td>
</tr>
<tr class="last">
<td></td>
<td class="bar">
<div id="BuurtAdam"></div>
</td>
<td>
<a href="https://indebuurt.nl/amsterdam/">In de buurt | Amsterdam</a>
</td>
</tr>
<!-- Collection: Groningen -->
<tr class="first">
<td>Groningen</td>
<td class="bar">
<div id="GG"></div>
</td>
<td>
<a href="https://gemeente.groningen.nl/nieuwsoverzicht">Gemeente Groningen</a>
</td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="BuurtGrn"></div>
</td>
<td>
<a href="https://indebuurt.nl/groningen/">In de buurt | Groningen</a>
</td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="Oog"></div>
</td>
<td><a href="https://www.oogtv.nl/">Oog</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="RTVNoord"></div>
</td>
<td><a href="https://www.rtvnoord.nl/">RTV Noord</a></td>
</tr>
<tr class="last">
<td></td>
<td class="bar">
<div id="Sikkom"></div>
</td>
<td><a href="https://sikkom.nl/">Sikkom</a></td>
</tr>
<!-- Collection: Literatuur -->
<tr class="first">
<td>Literatuur</td>
<td class="bar">
<div id="LitNL"></div>
</td>
<td>
<a href="https://www.literairnederland.nl/">Literair Nederland</a>
</td>
</tr>
<tr class="last">
<td></td>
<td class="bar">
<div id="Tzum"></div>
</td>
<td><a href="https://www.tzum.info/">Tzum</a></td>
</tr>
<!-- Collection: Vlaanderen -->
<tr class="first">
<td>Vlaanderen</td>
<td class="bar">
<div id="HLN"></div>
</td>
<td><a href="https://www.hln.be/">HLN</a></td>
</tr>
<tr>
<td></td>
<td class="bar">
<div id="VRT"></div>
</td>
<td><a href="https://www.vrt.be/vrtnws/nl/">VRT NWS</a></td>
</tr>
</table>
</div>
</body>
</html>

273
www/app2.js Normal file
View File

@@ -0,0 +1,273 @@
// Week index loaded from indexG2.json by init(); ordered from newest to oldest.
var dates
// Position in `dates` of the currently selected week (0 = newest).
var datesNr
// Week whose source counts are currently shown in the bars at the bottom of the page.
var countsWeek
// Category and collection names, in display order; they are also keys in the JSON data.
var parts = ['woorden', 'personen', 'locaties', 'organisaties', 'andere namen']
var sources = ['Algemeen', 'Amsterdam', 'Groningen', 'Literatuur', 'Vlaanderen']
// Cache of loaded week documents. NOTE: the loaders store weeks as plain
// properties (data[week] = ...), not as Map entries.
var data = new Map()
// Containers that are enabled/disabled or filled by the functions below.
const idSource = document.getElementById('source')
const idPart = document.getElementById('part')
const idSubtitle = document.getElementById('subtitle')
const idData = document.getElementById('data')
// The selection form and its controls, looked up by name or id.
const form1 = document.forms['choice']
const fWhat = form1['fWhat']
const fSource = form1['fSource']
const fPart = form1['fPart']
const fDate = form1['fDate']
const fSubmit = form1['fSubmit']
// Returns a promise that resolves, without a value, once a timer of `ms`
// milliseconds has fired.
function sleep(ms) {
  return new Promise(function (done) {
    setTimeout(done, ms)
  })
}
// Resizes the bars in the sources table to the document counts of `week`.
//
// week: key of an already loaded entry in `data`. Nothing happens when the
// bars already show this week.
//
// Each bar is the element whose id equals a key of data[week].sources; its
// width becomes the count as a percentage of data[week].max.
function setCounts(week) {
  if (week == countsWeek) {
    return
  }
  countsWeek = week
  const max = data[week].max
  for (const [id, count] of Object.entries(data[week].sources)) {
    const bar = document.getElementById(id)
    if (!bar) {
      // A source without a bar in the page must not abort the whole update.
      continue
    }
    // Browsers ignore an invalid width (NaN% when max is 0, or a negative
    // percentage) and would keep showing the previous week's bar.
    const pct = max > 0 ? Math.min(100, Math.max(0, (count / max) * 100)) : 0
    bar.style.width = pct + '%'
  }
}
// Fetches a JSON document below /alfa/wvdm/data/.
//
// url: path relative to that directory.
// Returns a promise that resolves with the parsed response on HTTP 200 and
// rejects with the HTTP status otherwise. On a network failure it rejects
// with the status reported by the request (0).
function getJSON(url) {
  return new Promise(function (resolve, reject) {
    const xhr = new XMLHttpRequest()
    xhr.open('get', '/alfa/wvdm/data/' + url, true)
    xhr.responseType = 'json'
    xhr.onload = function () {
      const status = xhr.status
      if (status == 200) {
        resolve(xhr.response)
      } else {
        reject(status)
      }
    }
    // Without this handler a failed connection never fires onload, so the
    // promise would stay pending forever and the caller would hang.
    xhr.onerror = function () {
      reject(xhr.status)
    }
    xhr.send()
  })
}
// Replaces the characters & ' < > in `s` with HTML entities.
//
// makeTD embeds the result in a single-quoted JavaScript string inside an
// event-handler attribute, so every apostrophe must be replaced: one left
// over would end that string early.
function escape(s) {
  // A string pattern in replace() only replaces the first match, so use
  // global regular expressions. '&' must be handled first.
  return s
    .replace(/&/g, '&amp;')
    .replace(/'/g, '&apos;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
}
// Builds one table cell holding a heading and a 20-row table of scores.
//
// title:  text for the heading.
// values: rows of the form [n, word, tags, g2, p_g2]; missing rows up to 20
//         are rendered as empty filler rows.
// Returns the new <td> element.
//
// Bar widths are relative to the g2 value of the first row. The tooltip is
// attached as an attribute rather than an event listener because the callers
// copy the result through innerHTML, which would drop listeners.
function makeTD(title, values) {
  const td = document.createElement('td')
  const h3 = document.createElement('h3')
  h3.appendChild(document.createTextNode(title))
  td.appendChild(h3)

  const tab = document.createElement('table')
  let max
  for (let i = 0; i < 20; i++) {
    const filled = i < values.length
    const value = filled ? values[i] : [0, '\xa0', '', '', '']
    if (i == 0) {
      max = value[3]
    }

    const tr = document.createElement('tr')
    if (filled) {
      let t2 = ''
      if (value[2]) {
        tr.classList.add('tags')
        t2 = '<br><small>' + escape(value[2]) + '</small>'
      }
      tr.setAttribute(
        'onmouseover',
        "tooltip.show('" + value[0] + ' \xa0 ' + escape(value[1]) + t2 + "')",
      )
      tr.setAttribute('onmouseout', 'tooltip.hide()')
    }

    const td1 = document.createElement('td')
    const div1 = document.createElement('div')
    // pc used to be an undeclared (implicit global) variable. When max is
    // empty or zero the division gives NaN; 'NaN%' is an invalid width that
    // browsers ignore, which left the bar at its default width.
    const pc = max ? (value[3] / max) * 100 : 0
    div1.style.width = pc + '%'
    td1.appendChild(div1)
    tr.appendChild(td1)

    const td2 = document.createElement('td')
    td2.appendChild(document.createTextNode(value[1]))
    tr.appendChild(td2)

    tab.appendChild(tr)
  }
  td.appendChild(tab)
  return td
}
// Shows all categories of one collection for one week.
//
// source: collection name (a key of the week document, e.g. 'Algemeen').
// week:   week identifier whose first four characters are the year.
// The week document is fetched on first use and kept in `data`.
async function loadSource(source, week) {
  // Weeks are cached as plain properties of `data`, so Map.prototype.has()
  // never finds them and every call used to fetch the file again.
  if (!data[week]) {
    data[week] = await getJSON(
      week.substring(0, 4) + '/DATA-' + week + '-G2.json',
    )
  }
  const current = data[week]
  idSubtitle.innerHTML = source + ' — t/m ' + current.last

  const d = document.createElement('div')
  const tab = document.createElement('table')
  tab.classList.add('outer')
  // Declared locally; it used to leak as an implicit global.
  const tr = document.createElement('tr')
  parts.forEach(function (part) {
    tr.appendChild(makeTD(part, current[source][part]))
  })
  tab.appendChild(tr)
  d.appendChild(tab)
  idData.innerHTML = d.innerHTML
  setCounts(week)
}
// Shows one category for all collections in one week.
//
// part: category name (e.g. 'woorden').
// week: week identifier whose first four characters are the year.
// The week document is fetched on first use and kept in `data`.
async function loadPart(part, week) {
  // Weeks are cached as plain properties of `data`, so Map.prototype.has()
  // never finds them and every call used to fetch the file again.
  if (!data[week]) {
    data[week] = await getJSON(
      week.substring(0, 4) + '/DATA-' + week + '-G2.json',
    )
  }
  const current = data[week]
  idSubtitle.innerHTML = part + ' — t/m ' + current.last

  const d = document.createElement('div')
  const tab = document.createElement('table')
  tab.classList.add('outer')
  // Declared locally; it used to leak as an implicit global.
  const tr = document.createElement('tr')
  sources.forEach(function (source) {
    tr.appendChild(makeTD(source, current[source][part]))
  })
  tab.appendChild(tr)
  d.appendChild(tab)
  idData.innerHTML = d.innerHTML
  setCounts(week)
}
// Shows a time series: one category of one collection for up to ten weeks,
// starting at the selected week (dates[datesNr]) and going back in time.
//
// source: collection name; part: category name.
// Week documents are fetched on first use and kept in `data`.
async function loadWeken(source, part) {
  idSubtitle.innerHTML = source + ' — ' + part

  const d = document.createElement('div')
  const tab = document.createElement('table')
  tab.classList.add('outer')
  // Declared locally; it used to leak as an implicit global.
  const tr = document.createElement('tr')
  const end = Math.min(datesNr + 10, dates.length)
  for (let i = datesNr; i < end; i++) {
    const week = dates[i].week
    // Weeks are cached as plain properties of `data`, so Map.prototype.has()
    // never finds them and every call used to fetch the file again.
    if (!data[week]) {
      data[week] = await getJSON(
        week.substring(0, 4) + '/DATA-' + week + '-G2.json',
      )
    }
    tr.appendChild(makeTD('t/m ' + data[week].last, data[week][source][part]))
  }
  tab.appendChild(tr)
  d.appendChild(tab)
  idData.innerHTML = d.innerHTML
  setCounts(dates[datesNr].week)
}
// Sets `datesNr` to the index in `dates` of the week that contains `date`.
//
// date: date string compared as text with the `first` and `last` fields of
//       the index entries; an empty string selects the newest week.
// `dates` runs from newest to oldest. Dates outside the index select the
// nearest end. A date that falls in a gap between two indexed weeks selects
// the entry on which the search ends.
function locateWeek(date) {
  if (date == '') {
    datesNr = 0
    return
  }

  const oldest = dates.length - 1
  if (date < dates[oldest].first) {
    datesNr = oldest
    return
  }
  if (date > dates[0].last) {
    datesNr = 0
    return
  }

  // Binary search over a list in descending order: a later start date
  // means the wanted week lies further down the list.
  let lo = 0
  let hi = oldest
  while (lo <= hi) {
    const mid = Math.floor((lo + hi) / 2)
    const entry = dates[mid]
    if (entry.first > date) {
      lo = mid + 1
    } else if (entry.last < date) {
      hi = mid - 1
    } else {
      datesNr = mid
      return
    }
  }
  // Not found: the weeks are not contiguous (some weeks are missing).
  datesNr = hi
}
// Click handler of the submit button: renders the view selected in the form.
//
// Reads the chosen view, source, part and date, locates the matching week,
// and fades the subtitle and data area while the new content is loaded.
async function kies() {
  const what = fWhat.value
  const source = fSource.value
  const part = fPart.value
  const date = fDate.value
  locateWeek(date)

  idData.classList.add('fade')
  idSubtitle.classList.add('fade')
  try {
    await sleep(20)
    if (what == 'opt1') {
      await loadSource(source, dates[datesNr].week)
    } else if (what == 'opt2') {
      await loadPart(part, dates[datesNr].week)
    } else {
      await loadWeken(source, part)
    }
  } finally {
    // Always undo the fade: when a load rejected (e.g. a missing week file)
    // the page used to stay faded. The error itself still propagates.
    idSubtitle.classList.remove('fade')
    idData.classList.remove('fade')
  }
}
// Click handler of the view radio buttons.
//
// i: 1 = one source (part selector off), 2 = one part (source selector off),
//    3 = time series (both selectors on). Any other value only enables the
//    submit button.
function opt(i) {
  fSubmit.disabled = false

  const sourceOn = i == 1 || i == 3
  const partOn = i == 2 || i == 3
  if (!sourceOn && !partOn) {
    return
  }

  idSource.classList.toggle('disabled', !sourceOn)
  fSource.disabled = !sourceOn
  idPart.classList.toggle('disabled', !partOn)
  fPart.disabled = !partOn
}
// Start-up: loads the week index, limits the date picker to the indexed
// range and starts loading the 'Algemeen' view of the newest week.
async function init() {
  dates = await getJSON('indexG2.json')
  datesNr = 0

  const newest = dates[0]
  const oldest = dates[dates.length - 1]
  fDate.setAttribute('min', oldest.last)
  fDate.setAttribute('max', newest.last)

  // Started but deliberately not awaited, as before.
  loadSource('Algemeen', newest.week)
}

init()