diff --git a/.gitignore b/.gitignore
index ca33533..8cf80f8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,6 +43,7 @@ Volkskrant/volkskrant
 VRT/metadata
 VRT/vrt
 bin/data2json
+bin/data22json
 bin/dates2json
 bin/flush
 bin/items2count
diff --git a/Makefile b/Makefile
index 1c43ae7..e4946aa 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ all:
 	make -C Volkskrant
 	make -C VRT
 	make bin/data2json
+	make bin/data22json
 	make bin/dates2json
 	make bin/flush
 	make bin/items2count
@@ -31,6 +32,9 @@ all:
 bin/data2json: cmd/data2json/*.go
 	go build -o $@ $^
 
+bin/data22json: cmd/data22json/*.go
+	go build -o $@ $^
+
 bin/dates2json: cmd/dates2json/*.go
 	go build -o $@ $^
 
diff --git a/cmd/data22json/data22json.go b/cmd/data22json/data22json.go
new file mode 100644
index 0000000..e1362d4
--- /dev/null
+++ b/cmd/data22json/data22json.go
@@ -0,0 +1,303 @@
+// Command data22json collects the scores and article counts for one ISO week
+// and prints them as a single JSON document on standard output.
+//
+// Usage: data22json yyyy.ww size
+//
+// yyyy.ww is the ISO year and week; size is the length of the period in weeks.
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"encoding/json"
+	"fmt"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// Data is the top-level JSON document written to standard output.
+type Data struct {
+	Year       int            `json:"year"`
+	Week       int            `json:"week"`
+	First      string         `json:"first"`
+	Last       string         `json:"last"`
+	Period     int            `json:"period"`
+	Start      string         `json:"start"`
+	Max        int            `json:"max"`
+	Sources    map[string]int `json:"sources"`
+	Algemeen   *Parts         `json:"Algemeen"`
+	Groningen  *Parts         `json:"Groningen"`
+	Amsterdam  *Parts         `json:"Amsterdam"`
+	Literatuur *Parts         `json:"Literatuur"`
+	Vlaanderen *Parts         `json:"Vlaanderen"`
+}
+
+// Parts holds the selected items of one source, one list per category.
+// Every item is encoded as [n, word, tags, g2, p_g2].
+type Parts struct {
+	Woorden      [][5]any `json:"woorden"`
+	Personen     [][5]any `json:"personen"`
+	AndereNamen  [][5]any `json:"andere namen"`
+	Locaties     [][5]any `json:"locaties"`
+	Organisaties [][5]any `json:"organisaties"`
+}
+
+// Scores is the part of a score file that this command reads.
+type Scores struct {
+	Up []Item `json:"up"`
+}
+
+// Item is one entry of the "up" list in a score file.
+type Item struct {
+	Extras string  `json:"extras"`
+	G2     float64 `json:"g2"`
+	N      int     `json:"n"`
+	PG2    float64 `json:"p_g2"`
+	Word   string  `json:"word"`
+}
+
+// maxItems is the maximum number of items emitted per category.
+const maxItems = 20
+
+var (
+	// sources maps the JSON name of a source to its name in score filenames.
+	sources = map[string]string{
+		"Algemeen":   "algemeen",
+		"Amsterdam":  "amsterdam",
+		"Groningen":  "groningen",
+		"Literatuur": "literatuur",
+		"Vlaanderen": "vlaanderen",
+	}
+
+	// parts maps the JSON name of a category to its name in score filenames.
+	// NOTE(review): the re field is never read in this file.
+	parts = map[string]struct {
+		file string
+		re   *regexp.Regexp
+	}{
+		"woorden":      {"allewoorden", nil},
+		"personen":     {"personen", nil},
+		"andere namen": {"overige-namen", nil},
+		"locaties":     {"locaties", nil},
+		"organisaties": {"organisaties", regexp.MustCompile(`^(ANP|AT5)`)},
+	}
+
+	// maanden is indexed by time.Month, hence the placeholder at index 0.
+	maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
+	// dagen is indexed by time.Weekday (0 = Sunday).
+	dagen = strings.Fields("zondag maandag dinsdag woensdag donderdag vrijdag zaterdag")
+
+	// Command-line arguments, set once in main.
+	year int
+	week int
+	size int
+
+	x = e.ExitErr
+)
+
+// main parses the arguments, gathers all data and prints it as JSON.
+func main() {
+
+	// Without this check a missing argument makes os.Args[1] or os.Args[2]
+	// panic with an index out of range.
+	if len(os.Args) < 3 {
+		x(fmt.Errorf("gebruik: data22json yyyy.ww size"))
+	}
+
+	aa := strings.Split(os.Args[1], ".")
+	if len(aa) != 2 {
+		x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.ww zijn"))
+	}
+
+	var err error
+	year, err = strconv.Atoi(aa[0])
+	x(err)
+	week, err = strconv.Atoi(aa[1])
+	x(err)
+	size, err = strconv.Atoi(os.Args[2])
+	x(err)
+
+	if year < 1000 || year > 9999 {
+		x(fmt.Errorf("ongeldig year: %d", year))
+	}
+	if week < 1 || week > 53 {
+		x(fmt.Errorf("ongeldige week: %d", week))
+	}
+	// A period shorter than one week would select no directories at all.
+	if size < 1 {
+		x(fmt.Errorf("ongeldige size: %d", size))
+	}
+
+	start, first, last, names := dates()
+
+	// The locals deliberately do not reuse the names sources and max, so
+	// the package-level map and the built-in stay visible.
+	maxCount, counts := makeCounts(names)
+	data := &Data{
+		Year:       year,
+		Week:       week,
+		First:      first,
+		Last:       last,
+		Period:     size,
+		Start:      start,
+		Max:        maxCount,
+		Sources:    counts,
+		Algemeen:   makeParts("Algemeen"),
+		Groningen:  makeParts("Groningen"),
+		Amsterdam:  makeParts("Amsterdam"),
+		Literatuur: makeParts("Literatuur"),
+		Vlaanderen: makeParts("Vlaanderen"),
+	}
+
+	b, err := json.Marshal(data)
+	x(err)
+	fmt.Println(string(b))
+}
+
+// makeParts loads all categories of one source.
+func makeParts(source string) *Parts {
+	return &Parts{
+		Woorden:      makeValues(source, "woorden"),
+		Personen:     makeValues(source, "personen"),
+		AndereNamen:  makeValues(source, "andere namen"),
+		Locaties:     makeValues(source, "locaties"),
+		Organisaties: makeValues(source, "organisaties"),
+	}
+}
+
+// makeValues reads the score file for the given source and category and
+// returns at most maxItems entries with a positive count, in file order.
+// The result is never nil, so an empty list is encoded as [] in JSON.
+func makeValues(source, part string) [][5]any {
+	v := make([][5]any, 0, maxItems)
+
+	// NOTE(review): "score%d4" puts a literal 4 after the period size, so
+	// size 1 gives "score14"; confirm that this matches the score files.
+	filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d.score%d4.json",
+		year,
+		sources[source],
+		parts[part].file,
+		year,
+		week,
+		size)
+
+	b, err := os.ReadFile(filename)
+	x(err)
+
+	var scores Scores
+	if err := json.Unmarshal(b, &scores); err != nil {
+		// Name the file, since a bare JSON error does not say which one failed.
+		x(fmt.Errorf("%s: %w", filename, err))
+	}
+
+	for _, item := range scores.Up {
+		if item.N <= 0 {
+			continue
+		}
+		// Only the part of extras before the first tab is kept.
+		tags, _, _ := strings.Cut(item.Extras, "\t")
+		v = append(v, [5]any{item.N, item.Word, tags, item.G2, item.PG2})
+		if len(v) == maxItems {
+			break
+		}
+	}
+
+	return v
+}
+
+// makeCounts counts the articles per source directory under
+// /net/corpora/nlnieuws for the given period directories. A directory is
+// treated as a source when its name starts with an upper-case ASCII letter.
+// Every .xml file adds one to the count and every .skip file subtracts one.
+// It returns the highest count and the count per source.
+//
+// As a side effect the working directory becomes /net/corpora/nlnieuws.
+func makeCounts(names []string) (int, map[string]int) {
+	maxCount := 0
+	counts := make(map[string]int)
+	x(os.Chdir("/net/corpora/nlnieuws"))
+	files, err := os.ReadDir(".")
+	x(err)
+	for _, file := range files {
+		if !file.IsDir() {
+			continue
+		}
+		filename := file.Name()
+		if filename[0] < 'A' || filename[0] > 'Z' {
+			continue
+		}
+		count := 0
+		for _, name := range names {
+			// A directory that cannot be read contributes nothing;
+			// presumably not every source has every period directory.
+			files2, err := os.ReadDir(filename + "/" + name)
+			if err != nil {
+				continue
+			}
+			for _, f := range files2 {
+				switch n := f.Name(); {
+				case strings.HasSuffix(n, ".xml"):
+					count++
+				case strings.HasSuffix(n, ".skip"):
+					count--
+				}
+			}
+		}
+		counts[filename] = count
+		maxCount = max(maxCount, count)
+	}
+	return maxCount, counts
+}
+
+// dates returns the formatted start of the period, the formatted first and
+// last day of the requested week, and the directory names that make up the
+// period: "yyyy/wNN" for every week followed by "yyyy/mm/dd" for every day.
+func dates() (start, first, last string, names []string) {
+
+	// January 1st
+	t := time.Date(year, 1, 1, 12, 0, 0, 0, time.UTC)
+
+	// find the first Thursday
+	day := int(t.Weekday()) // 0 = Sunday
+	donderdag := 4 - day
+	if donderdag < 0 {
+		donderdag += 7
+	}
+
+	// move to the Monday before the first Thursday: donderdag - 3
+	// then into the requested week: 7 * (week - 1)
+	t = t.AddDate(0, 0, donderdag-3+7*(week-1))
+
+	// Week 53 only exists in some years; without this check it would
+	// silently yield week 1 of the next year.
+	if y, w := t.ISOWeek(); y != year || w != week {
+		x(fmt.Errorf("week %d bestaat niet in %d", week, year))
+	}
+
+	t2 := t.AddDate(0, 0, 6)
+	tStart := t.AddDate(0, 0, (1-size)*7)
+
+	names = make([]string, 0, 8*max(size, 0))
+	t3 := tStart
+	for range size {
+		y, w := t3.ISOWeek()
+		names = append(names, fmt.Sprintf("%d/w%02d", y, w))
+		t3 = t3.AddDate(0, 0, 7)
+	}
+	t3 = tStart
+	for range 7 * size {
+		names = append(names, fmt.Sprintf("%d/%02d/%02d", t3.Year(), t3.Month(), t3.Day()))
+		t3 = t3.AddDate(0, 0, 1)
+	}
+
+	return makeDate(tStart), makeDate(t), makeDate(t2), names
+}
+
+// makeDate formats d as a two-letter Dutch weekday, the day of the month, a
+// three-letter Dutch month and the year, for example "ma 1 jan 2024".
+func makeDate(d time.Time) string {
+	return fmt.Sprintf("%s %d %s %d", dagen[d.Weekday()][:2], d.Day(), maanden[int(d.Month())][:3], d.Year())
+}
diff --git a/cmd/dates2json/dates2json.go b/cmd/dates2json/dates2json.go
index f5a5a9a..000881d 100644
--- a/cmd/dates2json/dates2json.go
+++ b/cmd/dates2json/dates2json.go
@@ -26,6 +26,8 @@ var (
 
 func main() {
 
+	suffix := os.Args[1]
+
 	dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
 	x(err)
 	for _, dir := range dirs {
@@ -41,7 +43,7 @@ func main() {
 
 		for _, file := range files {
 			filename := file.Name()
-			if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
+			if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, suffix) {
 				addWeek(filename[5:12])
 			}
 		}
diff --git a/collect.sh b/collect.sh
index 8e9d124..ca25f6d 100755
--- a/collect.sh
+++ b/collect.sh
@@ -201,8 +201,10 @@ done
 
 data2json $ds 1 > ../json/$year/DATA-$ds-1.json
 data2json $ds 4 > ../json/$year/DATA-$ds-4.json
-dates2json > ../json/index1.json
-dates2json > ../json/index4.json
+data22json $ds 1 > ../json/$year/DATA-$ds-G2.json
+dates2json 1.json > ../json/index1.json
+dates2json 4.json > ../json/index4.json
+dates2json G2.json > ../json/indexG2.json
 
 # rechten bijwerken
 chmod -R g+w /net/corpora/nlnieuws
diff --git a/www/app2.html b/www/app2.html
new file mode 100644
index 0000000..24b60f2
--- /dev/null
+++ b/www/app2.html
@@ -0,0 +1,209 @@
+
+
+
+ Woord van de maand
+
+
+
+
+
+
+
+
+
+
+

Woord van de maand

+
+
+
+ Wat wil je zien? +
+
    +
  1. + + +
  2. +
  3. + + +
  4. +
  5. + + +
  6. +
+
+ bron: + +
+
+ onderdeel: + +
+
+ week: + + +
+ +
+
+
+

+
+
+
+

Bronnen

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Algemeen +
+
NieuwsNL
+
+
NOS
+
+
NU
+
+
Reporters Online
+
+
Sargasso
+
+
de Volkskrant
Amsterdam +
+
AT5
+
+
+ Het Parool | Amsterdam +
+
+
+ In de buurt | Amsterdam +
Groningen +
+
+ Gemeente Groningen +
+
+
+ In de buurt | Groningen +
+
+
Oog
+
+
RTV Noord
+
+
Sikkom
Literatuur +
+
+ Literair Nederland +
+
+
Tzum
Vlaanderen +
+
HLN
+
+
VRT NWS
+
+
+
diff --git a/www/app2.js b/www/app2.js
new file mode 100644
index 0000000..4591148
--- /dev/null
+++ b/www/app2.js
@@ -0,0 +1,304 @@
+var dates
+var datesNr
+var countsWeek
+
+var parts = ['woorden', 'personen', 'locaties', 'organisaties', 'andere namen']
+
+var sources = ['Algemeen', 'Amsterdam', 'Groningen', 'Literatuur', 'Vlaanderen']
+
+// Cache of loaded week data, keyed by the week string from the index.
+// Always go through data.has/get/set: a plain property assignment such as
+// data[week] = ... is invisible to data.has, so nothing would ever be cached.
+var data = new Map()
+
+const idSource = document.getElementById('source')
+const idPart = document.getElementById('part')
+const idSubtitle = document.getElementById('subtitle')
+const idData = document.getElementById('data')
+const form1 = document.forms['choice']
+const fWhat = form1['fWhat']
+const fSource = form1['fSource']
+const fPart = form1['fPart']
+const fDate = form1['fDate']
+const fSubmit = form1['fSubmit']
+
+// Resolve after ms milliseconds.
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms))
+}
+
+// Scale the bar of every source to its count in the given week.
+// The week must already be loaded.
+function setCounts(week) {
+  if (week == countsWeek) {
+    return
+  }
+  countsWeek = week
+  const d = data.get(week)
+  for (const [name, count] of Object.entries(d.sources)) {
+    const el = document.getElementById(name)
+    // Skip sources that have no element in the page.
+    if (el) {
+      el.style.width = (d.max ? (count / d.max) * 100 : 0) + '%'
+    }
+  }
+}
+
+// Fetch a JSON file below /alfa/wvdm/data/. The promise is rejected with the
+// HTTP status on a non-200 response and with an Error on a network failure.
+function getJSON(url) {
+  return new Promise(function (resolve, reject) {
+    var xhr = new XMLHttpRequest()
+    xhr.open('get', '/alfa/wvdm/data/' + url, true)
+    xhr.responseType = 'json'
+    xhr.onload = function () {
+      var status = xhr.status
+      if (status == 200) {
+        resolve(xhr.response)
+      } else {
+        reject(status)
+      }
+    }
+    // Without this handler the promise never settles when the request fails.
+    xhr.onerror = function () {
+      reject(new Error('network error: ' + url))
+    }
+    xhr.send()
+  })
+}
+
+// Return the data of a week, downloading it on first use.
+async function getWeek(week) {
+  if (!data.has(week)) {
+    data.set(
+      week,
+      await getJSON(week.substring(0, 4) + '/DATA-' + week + '-G2.json'),
+    )
+  }
+  return data.get(week)
+}
+
+// Escape s for use in the tooltip text, which is placed inside a single-quoted
+// JavaScript string in an onmouseover attribute. Every occurrence is replaced
+// (a string pattern would only replace the first one), and quotes, backslashes
+// and line breaks are neutralised so they cannot end the string literal.
+// NOTE(review): assumes tooltip.show renders its argument as HTML - confirm.
+function escape(s) {
+  return String(s)
+    .replace(/&/g, '&amp;')
+    .replace(/\\/g, '&#92;')
+    .replace(/'/g, '&#39;')
+    .replace(/"/g, '&quot;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;')
+    .replace(/[\r\n\u2028\u2029]/g, ' ')
+}
+
+// Build one column: a heading followed by a table of 20 rows, each with a bar
+// and a word. values holds rows of [n, word, tags, g2, p_g2].
+function makeTD(title, values) {
+  const td = document.createElement('td')
+  const h3 = document.createElement('h3')
+  h3.appendChild(document.createTextNode(title))
+  td.appendChild(h3)
+  const tab = document.createElement('table')
+
+  // The score of the first row sets the scale of all bars.
+  const max = values.length > 0 ? values[0][3] : 0
+  for (let i = 0; i < 20; i++) {
+    // Short lists are padded with empty rows.
+    const value = i < values.length ? values[i] : [0, '\xa0', '', '', '']
+    const tr = document.createElement('tr')
+    if (i < values.length) {
+      let t2 = ''
+      if (value[2]) {
+        tr.classList.add('tags')
+        // NOTE(review): the markup around the tags was lost from the reviewed
+        // copy; a line break is assumed - restore the original if it differed.
+        t2 = '<br>' + escape(value[2])
+      }
+      tr.setAttribute(
+        'onmouseover',
+        "tooltip.show('" + value[0] + ' \xa0 ' + escape(value[1]) + t2 + "')",
+      )
+      tr.setAttribute('onmouseout', 'tooltip.hide()')
+    }
+    const td1 = document.createElement('td')
+    const div1 = document.createElement('div')
+    // Declared locally so it does not leak as an implicit global; an empty or
+    // zero scale gives an empty bar instead of NaN%.
+    const pc = max ? (value[3] / max) * 100 : 0
+    div1.style.width = pc + '%'
+    td1.appendChild(div1)
+    tr.appendChild(td1)
+    const td2 = document.createElement('td')
+    td2.appendChild(document.createTextNode(value[1]))
+    tr.appendChild(td2)
+    tab.appendChild(tr)
+  }
+
+  td.appendChild(tab)
+  return td
+}
+
+// Replace the contents of the data area with one row of columns.
+function showColumns(columns) {
+  const d = document.createElement('div')
+  const tab = document.createElement('table')
+  tab.classList.add('outer')
+  const tr = document.createElement('tr')
+  columns.forEach(function (column) {
+    tr.appendChild(column)
+  })
+  tab.appendChild(tr)
+  d.appendChild(tab)
+  idData.innerHTML = d.innerHTML
+}
+
+// Show all categories of one source for one week.
+async function loadSource(source, week) {
+  const d = await getWeek(week)
+
+  idSubtitle.innerHTML = source + ' — t/m ' + d.last
+
+  showColumns(
+    parts.map(function (part) {
+      return makeTD(part, d[source][part])
+    }),
+  )
+  setCounts(week)
+}
+
+// Show one category of all sources for one week.
+async function loadPart(part, week) {
+  const d = await getWeek(week)
+
+  idSubtitle.innerHTML = part + ' — t/m ' + d.last
+
+  showColumns(
+    sources.map(function (source) {
+      return makeTD(source, d[source][part])
+    }),
+  )
+  setCounts(week)
+}
+
+// Show one category of one source for up to ten weeks, starting at datesNr.
+async function loadWeken(source, part) {
+  idSubtitle.innerHTML = source + ' — ' + part
+
+  const columns = []
+  for (let i = datesNr; i < datesNr + 10 && i < dates.length; i++) {
+    const d = await getWeek(dates[i].week)
+    columns.push(makeTD('t/m ' + d.last, d[source][part]))
+  }
+
+  showColumns(columns)
+  setCounts(dates[datesNr].week)
+}
+
+// Set datesNr to the index of the week that contains date. An empty date or a
+// date after the newest week selects index 0; an older date selects the last.
+function locateWeek(date) {
+  if (date == '') {
+    datesNr = 0
+    return
+  }
+  var n = dates.length - 1
+  if (date < dates[n].first) {
+    datesNr = n
+    return
+  }
+  if (date > dates[0].last) {
+    datesNr = 0
+    return
+  }
+  var p1 = 0
+  var p2 = n
+  while (true) {
+    // handles non-contiguous weeks (some weeks are missing)
+    if (p1 > p2) {
+      datesNr = p2
+      return
+    }
+    var i = Math.floor((p1 + p2) / 2)
+    if (dates[i].first > date) {
+      p1 = i + 1 // correct: the list runs from large to small
+    } else if (dates[i].last < date) {
+      p2 = i - 1
+    } else {
+      datesNr = i
+      return
+    }
+  }
+}
+
+// Read the form and show the selected view.
+async function kies() {
+  let what = fWhat.value
+  let source = fSource.value
+  let part = fPart.value
+  let date = fDate.value
+
+  locateWeek(date)
+
+  idData.classList.add('fade')
+  idSubtitle.classList.add('fade')
+  await sleep(20)
+
+  try {
+    if (what == 'opt1') {
+      await loadSource(source, dates[datesNr].week)
+    } else if (what == 'opt2') {
+      await loadPart(part, dates[datesNr].week)
+    } else {
+      await loadWeken(source, part)
+    }
+  } finally {
+    // Always remove the fade class, also when loading failed.
+    idSubtitle.classList.remove('fade')
+    idData.classList.remove('fade')
+  }
+}
+
+// Enable the form fields that apply to option i and enable the submit button.
+function opt(i) {
+  fSubmit.disabled = false
+
+  if (i == 1) {
+    idSource.classList.remove('disabled')
+    fSource.disabled = false
+    idPart.classList.add('disabled')
+    fPart.disabled = true
+  }
+  if (i == 2) {
+    idSource.classList.add('disabled')
+    fSource.disabled = true
+    idPart.classList.remove('disabled')
+    fPart.disabled = false
+  }
+  if (i == 3) {
+    idSource.classList.remove('disabled')
+    fSource.disabled = false
+    idPart.classList.remove('disabled')
+    fPart.disabled = false
+  }
+}
+
+// Load the index of available weeks and show the first one for Algemeen.
+async function init() {
+  dates = await getJSON('indexG2.json')
+  datesNr = 0
+  // An empty index has no weeks to show or to bound the date field with.
+  if (!dates || dates.length == 0) {
+    return
+  }
+  fDate.setAttribute('min', dates[dates.length - 1].last)
+  fDate.setAttribute('max', dates[0].last)
+  await loadSource('Algemeen', dates[datesNr].week)
+}
+
+// Report a failed start-up instead of leaving an unhandled rejection.
+init().catch(function (err) {
+  console.error(err)
+})