data2json
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -32,6 +32,7 @@ Tzum/tzum
|
||||
Tzum/xml2txt
|
||||
VRT/metadata
|
||||
VRT/vrt
|
||||
bin/data2json
|
||||
bin/ISOWeek
|
||||
bin/flush
|
||||
bin/items2count
|
||||
|
||||
4
Makefile
4
Makefile
@@ -13,6 +13,7 @@ all:
|
||||
make -C Sikkom
|
||||
make -C Tzum
|
||||
make -C VRT
|
||||
make bin/data2json
|
||||
make bin/flush
|
||||
make bin/ISOWeek
|
||||
make bin/items2count
|
||||
@@ -20,6 +21,9 @@ all:
|
||||
make bin/top20
|
||||
make bin/week2files
|
||||
|
||||
bin/data2json: cmd/data2json/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
bin/flush: cmd/flush/*.go
|
||||
go build -o $@ $^
|
||||
|
||||
|
||||
217
cmd/data2json/data2json.go
Normal file
217
cmd/data2json/data2json.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Data struct {
|
||||
Year int `json:"year"`
|
||||
Week int `json:"week"`
|
||||
First string `json:"first"`
|
||||
Last string `json:"last"`
|
||||
Period int `json:"period"`
|
||||
Start string `json:"start"`
|
||||
Algemeen *Parts `json:"Algemeen"`
|
||||
Groningen *Parts `json:"Groningen"`
|
||||
Amsterdam *Parts `json:"Amsterdam"`
|
||||
Literatuur *Parts `json:"Literatuur"`
|
||||
Vlaanderen *Parts `json:"Vlaanderen"`
|
||||
}
|
||||
|
||||
// Parts holds the word lists of a single news source.
//
// Every list is a sequence of [count, word, tags] triples. The JSON keys
// are the same labels that are used as keys of the parts lookup table, so
// a consumer can address a list by the label it is produced from.
type Parts struct {
	NieuweNamen [][3]any `json:"nieuwe namen"`
	// The key was previously misspelled as "nieuw woorden", which did not
	// match the "nieuwe woorden" label used everywhere else.
	NieuweWoorden [][3]any `json:"nieuwe woorden"`
	Personen      [][3]any `json:"personen"`
	AndereNamen   [][3]any `json:"andere namen"`
	Locaties      [][3]any `json:"locaties"`
	Organisaties  [][3]any `json:"organisaties"`
}
|
||||
|
||||
// Value is a single list entry in object form: a word, its tags and the
// number of times it was counted.
//
// NOTE(review): nothing in the visible part of this file uses Value; the
// lists are emitted as [3]any triples instead. Confirm whether it is still
// needed.
type Value struct {
	Word  string `json:"word"`  // the word or name itself
	Tags  string `json:"tags"`  // tags attached to the word, may be empty
	Count int    `json:"count"` // number of occurrences
}
|
||||
|
||||
var (
|
||||
sources = map[string]string{
|
||||
"Algemeen": "algemeen",
|
||||
"Groningen": "groningen",
|
||||
"Amsterdam": "AT5",
|
||||
"Literatuur": "literatuur",
|
||||
"Vlaanderen": "VRT",
|
||||
}
|
||||
|
||||
parts = map[string]struct {
|
||||
file string
|
||||
suffix string
|
||||
}{
|
||||
"nieuwe namen": {"nieuwe-namen", ".t20"},
|
||||
"nieuwe woorden": {"nieuwe-woorden", ".t20"},
|
||||
"personen": {"personen", ""},
|
||||
"andere namen": {"overige-namen", ""},
|
||||
"locaties": {"locaties", ""},
|
||||
"organisaties": {"organisaties", ""},
|
||||
}
|
||||
|
||||
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
|
||||
dagen = strings.Fields("zondag maandag dinsdag woensdag donderdag vrijdag zaterdag")
|
||||
|
||||
year int
|
||||
week int
|
||||
size int
|
||||
|
||||
x = e.ExitErr
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
aa := strings.Split(os.Args[1], "-")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
|
||||
}
|
||||
|
||||
var err error
|
||||
year, err = strconv.Atoi(aa[0])
|
||||
x(err)
|
||||
week, err = strconv.Atoi(aa[1])
|
||||
x(err)
|
||||
size, err = strconv.Atoi(os.Args[2])
|
||||
x(err)
|
||||
|
||||
if year < 1000 || year > 9999 {
|
||||
x(fmt.Errorf("ongeldig year: %d", year))
|
||||
}
|
||||
if week < 1 || week > 53 {
|
||||
x(fmt.Errorf("ongeldige week: %d", week))
|
||||
}
|
||||
|
||||
start, first, last := dates()
|
||||
|
||||
data := &Data{
|
||||
Year: year,
|
||||
Week: week,
|
||||
First: first,
|
||||
Last: last,
|
||||
Period: size,
|
||||
Start: start,
|
||||
Algemeen: makeParts("Algemeen"),
|
||||
Groningen: makeParts("Groningen"),
|
||||
Amsterdam: makeParts("Amsterdam"),
|
||||
Literatuur: makeParts("Literatuur"),
|
||||
Vlaanderen: makeParts("Vlaanderen"),
|
||||
}
|
||||
|
||||
b, err := json.Marshal(data)
|
||||
x(err)
|
||||
fmt.Println(string(b))
|
||||
|
||||
}
|
||||
|
||||
func makeParts(source string) *Parts {
|
||||
return &Parts{
|
||||
NieuweNamen: makeValues(source, "nieuwe namen"),
|
||||
NieuweWoorden: makeValues(source, "nieuwe woorden"),
|
||||
Personen: makeValues(source, "personen"),
|
||||
AndereNamen: makeValues(source, "andere namen"),
|
||||
Locaties: makeValues(source, "locaties"),
|
||||
Organisaties: makeValues(source, "organisaties"),
|
||||
}
|
||||
}
|
||||
|
||||
func makeValues(source, part string) [][3]any {
|
||||
v := make([][3]any, 0)
|
||||
|
||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
|
||||
sources[source],
|
||||
parts[part].file,
|
||||
year,
|
||||
week,
|
||||
size,
|
||||
parts[part].suffix)
|
||||
|
||||
fp, err := os.Open(filename)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
lineno := 0
|
||||
for scanner.Scan() {
|
||||
lineno++
|
||||
line := scanner.Text()
|
||||
aa := strings.Split(line, "\t")
|
||||
count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
|
||||
x(err)
|
||||
word := aa[1]
|
||||
var tags string
|
||||
if len(aa) > 2 {
|
||||
tags = aa[2]
|
||||
}
|
||||
v = append(v, [3]any{count, word, tags})
|
||||
if lineno == 20 {
|
||||
break
|
||||
}
|
||||
}
|
||||
x(scanner.Err())
|
||||
|
||||
return v
|
||||
}
|
||||
|
||||
func dates() (start, first, last string) {
|
||||
// 15 januari van het jaar
|
||||
t := time.Date(year, 1, 15, 12, 0, 0, 0, time.UTC)
|
||||
|
||||
// eerste gok
|
||||
t = t.AddDate(0, 0, 7*week-14)
|
||||
|
||||
// zoek juiste week
|
||||
var y, w int
|
||||
for {
|
||||
y, w = t.ISOWeek()
|
||||
if y < year {
|
||||
t = t.AddDate(0, 12, 0)
|
||||
continue
|
||||
}
|
||||
if y > year {
|
||||
t = t.AddDate(0, -12, 0)
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
for {
|
||||
y, w = t.ISOWeek()
|
||||
if w < week {
|
||||
t = t.AddDate(0, 0, 7)
|
||||
continue
|
||||
}
|
||||
if w > week {
|
||||
t = t.AddDate(0, 0, -7)
|
||||
}
|
||||
break
|
||||
}
|
||||
if y != year {
|
||||
x(fmt.Errorf("ongeldige combinatie van week/jaar: %d/%d", week, year))
|
||||
}
|
||||
|
||||
// zoek begin van de week
|
||||
d := int(t.Weekday())
|
||||
if d == 0 {
|
||||
d = 7
|
||||
}
|
||||
tFirst := t.AddDate(0, 0, 1-d)
|
||||
tLast := tFirst.AddDate(0, 0, 6)
|
||||
tStart := tFirst.AddDate(0, 0, (1-size)*7)
|
||||
|
||||
return makeDate(tStart), makeDate(tFirst), makeDate(tLast)
|
||||
|
||||
}
|
||||
|
||||
func makeDate(d time.Time) string {
|
||||
return fmt.Sprintf("%s %d %s %d", dagen[d.Weekday()][:2], d.Day(), maanden[int(d.Month())][:3], d.Year())
|
||||
}
|
||||
@@ -147,5 +147,9 @@ do
|
||||
| grep -v '^ *1 ' \
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-nieuwe-adjww-extra-$ds-$i
|
||||
|
||||
|
||||
data2json $ds $i > DATA-$d-$i.json
|
||||
|
||||
done
|
||||
done
|
||||
|
||||
1
www/DATA-2026-15-1.json
Normal file
1
www/DATA-2026-15-1.json
Normal file
File diff suppressed because one or more lines are too long
1
www/DATA-2026-15-4.json
Normal file
1
www/DATA-2026-15-4.json
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user