diff --git a/.gitignore b/.gitignore index ca33533..8cf80f8 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ Volkskrant/volkskrant VRT/metadata VRT/vrt bin/data2json +bin/data22json bin/dates2json bin/flush bin/items2count diff --git a/Makefile b/Makefile index 1c43ae7..e4946aa 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,7 @@ all: make -C Volkskrant make -C VRT make bin/data2json + make bin/data22json make bin/dates2json make bin/flush make bin/items2count @@ -31,6 +32,9 @@ all: bin/data2json: cmd/data2json/*.go go build -o $@ $^ +bin/data22json: cmd/data22json/*.go + go build -o $@ $^ + bin/dates2json: cmd/dates2json/*.go go build -o $@ $^ diff --git a/cmd/data22json/data22json.go b/cmd/data22json/data22json.go new file mode 100644 index 0000000..e1362d4 --- /dev/null +++ b/cmd/data22json/data22json.go @@ -0,0 +1,244 @@ +package main + +import ( + e "codeberg.org/pebbe/errors" + + "encoding/json" + "fmt" + "os" + "regexp" + "strconv" + "strings" + "time" +) + +type Data struct { + Year int `json:"year"` + Week int `json:"week"` + First string `json:"first"` + Last string `json:"last"` + Period int `json:"period"` + Start string `json:"start"` + Max int `json:"max"` + Sources map[string]int `json:"sources"` + Algemeen *Parts `json:"Algemeen"` + Groningen *Parts `json:"Groningen"` + Amsterdam *Parts `json:"Amsterdam"` + Literatuur *Parts `json:"Literatuur"` + Vlaanderen *Parts `json:"Vlaanderen"` +} + +type Parts struct { + Woorden [][5]any `json:"woorden"` + Personen [][5]any `json:"personen"` + AndereNamen [][5]any `json:"andere namen"` + Locaties [][5]any `json:"locaties"` + Organisaties [][5]any `json:"organisaties"` +} + +type Scores struct { + Up []Item `json:"up"` +} + +type Item struct { + Extras string `json:"extras"` + G2 float64 `json:"g2"` + N int `json:"n"` + PG2 float64 `json:"p_g2"` + Word string `json:"word"` +} + +var ( + sources = map[string]string{ + "Algemeen": "algemeen", + "Amsterdam": "amsterdam", + "Groningen": "groningen", + "Literatuur": "literatuur", + "Vlaanderen": "vlaanderen", + } + + parts = map[string]struct { + file string + re *regexp.Regexp + }{ + "woorden": {"allewoorden", nil}, + "personen": {"personen", nil}, + "andere namen": {"overige-namen", nil}, + "locaties": {"locaties", nil}, + "organisaties": {"organisaties", regexp.MustCompile(`^(ANP|AT5)`)}, + } + + maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december") + dagen = strings.Fields("zondag maandag dinsdag woensdag donderdag vrijdag zaterdag") + + year int + week int + size int + + x = e.ExitErr +) + +func main() { + + aa := strings.Split(os.Args[1], ".") + if len(aa) != 2 { + x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn")) + } + + var err error + year, err = strconv.Atoi(aa[0]) + x(err) + week, err = strconv.Atoi(aa[1]) + x(err) + size, err = strconv.Atoi(os.Args[2]) + x(err) + + if year < 1000 || year > 9999 { + x(fmt.Errorf("ongeldig year: %d", year)) + } + if week < 1 || week > 53 { + x(fmt.Errorf("ongeldige week: %d", week)) + } + + start, first, last, names := dates() + + max, sources := makeCounts(names) + data := &Data{ + Year: year, + Week: week, + First: first, + Last: last, + Period: size, + Start: start, + Max: max, + Sources: sources, + Algemeen: makeParts("Algemeen"), + Groningen: makeParts("Groningen"), + Amsterdam: makeParts("Amsterdam"), + Literatuur: makeParts("Literatuur"), + Vlaanderen: makeParts("Vlaanderen"), + } + + b, err := json.Marshal(data) + x(err) + fmt.Println(string(b)) +} + +func makeParts(source string) *Parts { + return &Parts{ + Woorden: makeValues(source, "woorden"), + Personen: makeValues(source, "personen"), + AndereNamen: makeValues(source, "andere namen"), + Locaties: makeValues(source, "locaties"), + Organisaties: makeValues(source, "organisaties"), + } +} + +func makeValues(source, part string) [][5]any { + v := make([][5]any, 0) + + filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d.score%d4.json", + year, + sources[source], + parts[part].file, + year, + week, + size) + + b, err := os.ReadFile(filename) + x(err) + + var scores Scores + x(json.Unmarshal(b, &scores)) + + lineno := 0 + for _, item := range scores.Up { + if item.N > 0 { + lineno++ + tags, _, _ := strings.Cut(item.Extras, "\t") + v = append(v, [5]any{item.N, item.Word, tags, item.G2, item.PG2}) + if lineno == 20 { + break + } + } + } + + return v + +} + +func makeCounts(names []string) (int, map[string]int) { + max := 0 + counts := make(map[string]int) + x(os.Chdir("/net/corpora/nlnieuws")) + files, err := os.ReadDir(".") + x(err) + for _, file := range files { + if !file.IsDir() { + continue + } + filename := file.Name() + if filename[0] < 'A' || filename[0] > 'Z' { + continue + } + count := 0 + for _, name := range names { + files2, err := os.ReadDir(filename + "/" + name) + if err != nil { + continue + } + for _, f := range files2 { + if n := f.Name(); strings.HasSuffix(n, ".xml") { + count++ + } else if strings.HasSuffix(n, ".skip") { + count-- + } + } + } + counts[filename] = count + if count > max { + max = count + } + } + return max, counts +} + +func dates() (start, first, last string, names []string) { + + // 1 januari + t := time.Date(year, 1, 1, 12, 0, 0, 0, time.UTC) + + // zoek eerste donderdag + day := int(t.Weekday()) // 0 = zondag + donderdag := 4 - day + if donderdag < 0 { + donderdag = donderdag + 7 + } + + // schuif naar maandag voor eerste donderdag: donderdag - 3 + // in de gewenste week: 7 * (week - 1) + t = t.AddDate(0, 0, donderdag-3+7*(week-1)) + + t2 := t.AddDate(0, 0, 6) + tStart := t.AddDate(0, 0, (1-size)*7) + + names = make([]string, 0) + t3 := tStart + for range size { + y, w := t3.ISOWeek() + names = append(names, fmt.Sprintf("%d/w%02d", y, w)) + t3 = t3.AddDate(0, 0, 7) + } + t3 = tStart + for range 7 * size { + names = append(names, fmt.Sprintf("%d/%02d/%02d", t3.Year(), t3.Month(), t3.Day())) + t3 = t3.AddDate(0, 0, 1) + } + + return makeDate(tStart), makeDate(t), makeDate(t2), names + +} + +func makeDate(d time.Time) string { + return fmt.Sprintf("%s %d %s %d", dagen[d.Weekday()][:2], d.Day(), maanden[int(d.Month())][:3], d.Year()) +} diff --git a/cmd/dates2json/dates2json.go b/cmd/dates2json/dates2json.go index f5a5a9a..000881d 100644 --- a/cmd/dates2json/dates2json.go +++ b/cmd/dates2json/dates2json.go @@ -26,6 +26,8 @@ var ( func main() { + suffix := os.Args[1] + dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json") x(err) for _, dir := range dirs { @@ -41,7 +43,7 @@ func main() { for _, file := range files { filename := file.Name() - if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") { + if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, suffix) { addWeek(filename[5:12]) } } diff --git a/collect.sh b/collect.sh index 8e9d124..ca25f6d 100755 --- a/collect.sh +++ b/collect.sh @@ -201,8 +201,10 @@ done data2json $ds 1 > ../json/$year/DATA-$ds-1.json data2json $ds 4 > ../json/$year/DATA-$ds-4.json -dates2json > ../json/index1.json -dates2json > ../json/index4.json +data22json $ds 1 > ../json/$year/DATA-$ds-G2.json +dates2json 1.json > ../json/index1.json +dates2json 4.json > ../json/index4.json +dates2json G2.json > ../json/indexG2.json # rechten bijwerken chmod -R g+w /net/corpora/nlnieuws diff --git a/www/app2.html b/www/app2.html new file mode 100644 index 0000000..24b60f2 --- /dev/null +++ b/www/app2.html @@ -0,0 +1,209 @@ + + +
+| Algemeen | ++ + | +NieuwsNL | +
| + | + + | +NOS | +
| + | + + | +NU | +
| + | + + | +Reporters Online | +
| + | + + | +Sargasso | +
| + | + + | +de Volkskrant | +
| Amsterdam | ++ + | +AT5 | +
| + | + + | ++ Het Parool | Amsterdam + | +
| + | + + | ++ In de buurt | Amsterdam + | +
| Groningen | ++ + | ++ Gemeente Groningen + | +
| + | + + | ++ In de buurt | Groningen + | +
| + | + + | +Oog | +
| + | + + | +RTV Noord | +
| + | + + | +Sikkom | +
| Literatuur | ++ + | ++ Literair Nederland + | +
| + | + + | +Tzum | +
| Vlaanderen | ++ + | +HLN | +
| + | + + | +VRT NWS | +