Files
nlnieuws/NOS/cmd/json2txt/json2txt.go
2026-05-29 17:22:10 +02:00

97 lines
2.0 KiB
Go

package main
import (
e "codeberg.org/pebbe/errors"
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
"encoding/json"
"fmt"
"os"
"regexp"
"strings"
"time"
)
// Graph is the decoding target for a JSON document that wraps its entries
// in a top-level "@graph" array.
type Graph struct {
	// Items receives the elements of the "@graph" array.
	Items []Item `json:"@graph"`
}
// Item holds the fields of a single JSON entry that this program uses.
type Item struct {
	// Type is the raw "@type" value. It is declared as any, so it decodes
	// whatever JSON value is present rather than only a string.
	Type any `json:"@type"`

	// Title is taken from the "name" key.
	Title string `json:"name"`

	// Text is taken from the "articleBody" key.
	Text string `json:"articleBody"`

	// Cats is taken from the "articleSection" key.
	Cats []string `json:"articleSection"`

	// Tags is taken from the "keywords" key.
	Tags []string `json:"keywords"`
}
// x is a short alias for e.ExitErr; every error in this program is routed
// through it.
// NOTE(review): presumably it reports a non-nil error and terminates the
// program — confirm against the errors package.
var x = e.ExitErr

// reYearWeek validates the command-line date argument. Despite its name it
// matches a year-month-day string: a four-digit year starting with 2,
// a two-digit month starting with 0 or 1, and a two-digit day starting
// with 0-3, separated by hyphens.
var reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
// main converts the JSON files of one day to plain text.
//
// Usage: json2txt [yyyy-mm-dd]
//
// Without an argument the date of two days ago is used. The program changes
// into /net/corpora/nlnieuws/NOS/yyyy/mm/dd, creates a sub-directory "out",
// and for every non-directory entry NAME.json writes out/NAME.txt holding a
// "##META text cat" line per category, a "##META text tag" line per keyword,
// the title, and then the article body split after each newline.
//
// All errors are handed to x.
// NOTE(review): x (e.ExitErr) presumably terminates the program on a non-nil
// error — confirm against the errors package.
func main() {
	var ds string
	switch len(os.Args) {
	case 1:
		// Default to the day before yesterday.
		ds = time.Now().AddDate(0, 0, -2).Format("2006-01-02")
	case 2:
		if !reYearWeek.MatchString(os.Args[1]) {
			x(fmt.Errorf("arg must be yyyy-mm-dd"))
		}
		ds = os.Args[1]
	default:
		x(fmt.Errorf("too many arguments"))
	}

	// The corpus is laid out as one directory per day: yyyy/mm/dd.
	dp := strings.ReplaceAll(ds, "-", "/")
	x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
	x(os.MkdirAll("out", 0777))

	files, err := os.ReadDir(".")
	x(err)

	for _, file := range files {
		filename := file.Name()
		// Directories cannot be read as a file; skip them together with
		// everything that does not carry the .json suffix.
		if file.IsDir() || !strings.HasSuffix(filename, ".json") {
			continue
		}

		b, err := os.ReadFile(filename)
		x(err)

		// Parse before creating the output file: os.Create truncates an
		// existing file, so a parse failure must be reported before any
		// earlier output is touched.
		item := getItem(b, filename)

		fp, err := os.Create("out/" + strings.TrimSuffix(filename, ".json") + ".txt")
		x(err)

		for _, cat := range item.Cats {
			x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat)))
		}
		for _, tag := range item.Tags {
			x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
		}

		// NOTE(review): u.AddEnd and u.FixSpace are defined in the internal
		// util package; their exact normalisation is not visible here.
		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
		for _, line := range strings.SplitAfter(item.Text, "\n") {
			x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
		}

		x(fp.Close())
	}
}
// getItem extracts the article from the raw JSON in b.
//
// The data is first decoded as a Graph. If that decoding succeeds, the first
// entry whose "@type" is the plain string "NewsArticle" is returned.
// In every other case b is decoded directly as a single Item; the result of
// that decoding is passed to x together with filename.
func getItem(b []byte, filename string) Item {
	var doc Graph
	if err := json.Unmarshal(b, &doc); err == nil {
		for i := range doc.Items {
			// "@type" is decoded into an any; only a string value is
			// compared, anything else is passed over.
			if kind, ok := doc.Items[i].Type.(string); ok && kind == "NewsArticle" {
				return doc.Items[i]
			}
		}
	}

	// No matching graph entry: treat the whole document as one item.
	var single Item
	x(json.Unmarshal(b, &single), filename)
	return single
}