app2
This commit is contained in:
244
cmd/data22json/data22json.go
Normal file
244
cmd/data22json/data22json.go
Normal file
@@ -0,0 +1,244 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
e "codeberg.org/pebbe/errors"
|
||||
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Data struct {
|
||||
Year int `json:"year"`
|
||||
Week int `json:"week"`
|
||||
First string `json:"first"`
|
||||
Last string `json:"last"`
|
||||
Period int `json:"period"`
|
||||
Start string `json:"start"`
|
||||
Max int `json:"max"`
|
||||
Sources map[string]int `json:"sources"`
|
||||
Algemeen *Parts `json:"Algemeen"`
|
||||
Groningen *Parts `json:"Groningen"`
|
||||
Amsterdam *Parts `json:"Amsterdam"`
|
||||
Literatuur *Parts `json:"Literatuur"`
|
||||
Vlaanderen *Parts `json:"Vlaanderen"`
|
||||
}
|
||||
|
||||
// Parts groups the top-scoring items of a single corpus by category.
//
// Every row is a five-element tuple as built by makeValues:
// count, word, tags, g2 score, p_g2 score.
type Parts struct {
	// Woorden holds the rows for ordinary words.
	Woorden [][5]any `json:"woorden"`
	// Personen holds the rows for person names.
	Personen [][5]any `json:"personen"`
	// AndereNamen holds the rows for other names; note the space in the JSON key.
	AndereNamen [][5]any `json:"andere namen"`
	// Locaties holds the rows for locations.
	Locaties [][5]any `json:"locaties"`
	// Organisaties holds the rows for organisations.
	Organisaties [][5]any `json:"organisaties"`
}
|
||||
|
||||
type Scores struct {
|
||||
Up []Item `json:"up"`
|
||||
}
|
||||
|
||||
// Item is one entry of the "up" list in a score file.
type Item struct {
	// Extras is a tab-separated string; makeValues keeps only the text
	// before the first tab.
	Extras string `json:"extras"`
	// G2 is the score stored under "g2"; it is copied to the output unchanged.
	G2 float64 `json:"g2"`
	// N is the count stored under "n"; items with N <= 0 are skipped by makeValues.
	N int `json:"n"`
	// PG2 is the score stored under "p_g2"; it is copied to the output unchanged.
	PG2 float64 `json:"p_g2"`
	// Word is the scored word or name.
	Word string `json:"word"`
}
|
||||
|
||||
var (
|
||||
sources = map[string]string{
|
||||
"Algemeen": "algemeen",
|
||||
"Amsterdam": "amsterdam",
|
||||
"Groningen": "groningen",
|
||||
"Literatuur": "literatuur",
|
||||
"Vlaanderen": "vlaanderen",
|
||||
}
|
||||
|
||||
parts = map[string]struct {
|
||||
file string
|
||||
re *regexp.Regexp
|
||||
}{
|
||||
"woorden": {"allewoorden", nil},
|
||||
"personen": {"personen", nil},
|
||||
"andere namen": {"overige-namen", nil},
|
||||
"locaties": {"locaties", nil},
|
||||
"organisaties": {"organisaties", regexp.MustCompile(`^(ANP|AT5)`)},
|
||||
}
|
||||
|
||||
maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
|
||||
dagen = strings.Fields("zondag maandag dinsdag woensdag donderdag vrijdag zaterdag")
|
||||
|
||||
year int
|
||||
week int
|
||||
size int
|
||||
|
||||
x = e.ExitErr
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
aa := strings.Split(os.Args[1], ".")
|
||||
if len(aa) != 2 {
|
||||
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
|
||||
}
|
||||
|
||||
var err error
|
||||
year, err = strconv.Atoi(aa[0])
|
||||
x(err)
|
||||
week, err = strconv.Atoi(aa[1])
|
||||
x(err)
|
||||
size, err = strconv.Atoi(os.Args[2])
|
||||
x(err)
|
||||
|
||||
if year < 1000 || year > 9999 {
|
||||
x(fmt.Errorf("ongeldig year: %d", year))
|
||||
}
|
||||
if week < 1 || week > 53 {
|
||||
x(fmt.Errorf("ongeldige week: %d", week))
|
||||
}
|
||||
|
||||
start, first, last, names := dates()
|
||||
|
||||
max, sources := makeCounts(names)
|
||||
data := &Data{
|
||||
Year: year,
|
||||
Week: week,
|
||||
First: first,
|
||||
Last: last,
|
||||
Period: size,
|
||||
Start: start,
|
||||
Max: max,
|
||||
Sources: sources,
|
||||
Algemeen: makeParts("Algemeen"),
|
||||
Groningen: makeParts("Groningen"),
|
||||
Amsterdam: makeParts("Amsterdam"),
|
||||
Literatuur: makeParts("Literatuur"),
|
||||
Vlaanderen: makeParts("Vlaanderen"),
|
||||
}
|
||||
|
||||
b, err := json.Marshal(data)
|
||||
x(err)
|
||||
fmt.Println(string(b))
|
||||
}
|
||||
|
||||
func makeParts(source string) *Parts {
|
||||
return &Parts{
|
||||
Woorden: makeValues(source, "woorden"),
|
||||
Personen: makeValues(source, "personen"),
|
||||
AndereNamen: makeValues(source, "andere namen"),
|
||||
Locaties: makeValues(source, "locaties"),
|
||||
Organisaties: makeValues(source, "organisaties"),
|
||||
}
|
||||
}
|
||||
|
||||
func makeValues(source, part string) [][5]any {
|
||||
v := make([][5]any, 0)
|
||||
|
||||
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d.score%d4.json",
|
||||
year,
|
||||
sources[source],
|
||||
parts[part].file,
|
||||
year,
|
||||
week,
|
||||
size)
|
||||
|
||||
b, err := os.ReadFile(filename)
|
||||
x(err)
|
||||
|
||||
var scores Scores
|
||||
x(json.Unmarshal(b, &scores))
|
||||
|
||||
lineno := 0
|
||||
for _, item := range scores.Up {
|
||||
if item.N > 0 {
|
||||
lineno++
|
||||
tags, _, _ := strings.Cut(item.Extras, "\t")
|
||||
v = append(v, [5]any{item.N, item.Word, tags, item.G2, item.PG2})
|
||||
if lineno == 20 {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return v
|
||||
|
||||
}
|
||||
|
||||
func makeCounts(names []string) (int, map[string]int) {
|
||||
max := 0
|
||||
counts := make(map[string]int)
|
||||
x(os.Chdir("/net/corpora/nlnieuws"))
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
if !file.IsDir() {
|
||||
continue
|
||||
}
|
||||
filename := file.Name()
|
||||
if filename[0] < 'A' || filename[0] > 'Z' {
|
||||
continue
|
||||
}
|
||||
count := 0
|
||||
for _, name := range names {
|
||||
files2, err := os.ReadDir(filename + "/" + name)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, f := range files2 {
|
||||
if n := f.Name(); strings.HasSuffix(n, ".xml") {
|
||||
count++
|
||||
} else if strings.HasSuffix(n, ".skip") {
|
||||
count--
|
||||
}
|
||||
}
|
||||
}
|
||||
counts[filename] = count
|
||||
if count > max {
|
||||
max = count
|
||||
}
|
||||
}
|
||||
return max, counts
|
||||
}
|
||||
|
||||
func dates() (start, first, last string, names []string) {
|
||||
|
||||
// 1 januari
|
||||
t := time.Date(year, 1, 1, 12, 0, 0, 0, time.UTC)
|
||||
|
||||
// zoek eerste donderdag
|
||||
day := int(t.Weekday()) // 0 = zondag
|
||||
donderdag := 4 - day
|
||||
if donderdag < 0 {
|
||||
donderdag = donderdag + 7
|
||||
}
|
||||
|
||||
// schuif naar maandag voor eerste donderdag: donderdag - 3
|
||||
// in de gewenste week: 7 * (week - 1)
|
||||
t = t.AddDate(0, 0, donderdag-3+7*(week-1))
|
||||
|
||||
t2 := t.AddDate(0, 0, 6)
|
||||
tStart := t.AddDate(0, 0, (1-size)*7)
|
||||
|
||||
names = make([]string, 0)
|
||||
t3 := tStart
|
||||
for range size {
|
||||
y, w := t3.ISOWeek()
|
||||
names = append(names, fmt.Sprintf("%d/w%02d", y, w))
|
||||
t3 = t3.AddDate(0, 0, 7)
|
||||
}
|
||||
t3 = tStart
|
||||
for range 7 * size {
|
||||
names = append(names, fmt.Sprintf("%d/%02d/%02d", t3.Year(), t3.Month(), t3.Day()))
|
||||
t3 = t3.AddDate(0, 0, 1)
|
||||
}
|
||||
|
||||
return makeDate(tStart), makeDate(t), makeDate(t2), names
|
||||
|
||||
}
|
||||
|
||||
func makeDate(d time.Time) string {
|
||||
return fmt.Sprintf("%s %d %s %d", dagen[d.Weekday()][:2], d.Day(), maanden[int(d.Month())][:3], d.Year())
|
||||
}
|
||||
@@ -26,6 +26,8 @@ var (
|
||||
|
||||
func main() {
|
||||
|
||||
suffix := os.Args[1]
|
||||
|
||||
dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
|
||||
x(err)
|
||||
for _, dir := range dirs {
|
||||
@@ -41,7 +43,7 @@ func main() {
|
||||
|
||||
for _, file := range files {
|
||||
filename := file.Name()
|
||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
|
||||
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, suffix) {
|
||||
addWeek(filename[5:12])
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user