package main import ( e "codeberg.org/pebbe/errors" u "git.web.rug.nl/p209327/nlnieuws/internal/util" "encoding/json" "fmt" "os" "regexp" "strings" "time" ) type Graph struct { Items []Item `json:"@graph"` } type Item struct { Type any `json:"@type"` Title string `json:"name"` Text string `json:"articleBody"` Cats []string `json:"articleSection"` Tags []string `json:"keywords"` } var ( x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`) ) func main() { var ds string switch len(os.Args) { case 1: t := time.Now().AddDate(0, 0, -2) ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day()) case 2: if !reYearWeek.MatchString(os.Args[1]) { x(fmt.Errorf("arg must be yyyy-mm-dd")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } dp := strings.ReplaceAll(ds, "-", "/") x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp)) x(os.MkdirAll("out", 0777)) files, err := os.ReadDir(".") x(err) for _, file := range files { filename := file.Name() if !strings.HasSuffix(filename, ".json") { continue } b, err := os.ReadFile(filename) x(err) fp, err := os.Create("out/" + filename[:len(filename)-5] + ".txt") x(err) item := getItem(b, filename) for _, cat := range item.Cats { x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat))) } for _, tag := range item.Tags { x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag))) } x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) for _, line := range strings.SplitAfter(item.Text, "\n") { x(fp.WriteString(u.AddEnd(u.FixSpace(line, true)))) } x(fp.Close()) } } func getItem(b []byte, filename string) Item { var graph Graph if json.Unmarshal(b, &graph) == nil { if graph.Items != nil { for _, item := range graph.Items { switch v := item.Type.(type) { case string: if v == "NewsArticle" { return item } } } } } var item Item x(json.Unmarshal(b, &item), filename) return item }