95 lines
1.9 KiB
Go
95 lines
1.9 KiB
Go
package main
|
|
|
|
import (
|
|
e "codeberg.org/pebbe/errors"
|
|
|
|
u "git.web.rug.nl/p209327/nlnieuws/internal/util"
|
|
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type Graph struct {
|
|
Items []Item `json:"@graph"`
|
|
}
|
|
|
|
// Item is a single entry of a news document as decoded from JSON.
type Item struct {
	// Type is the raw "@type" value. It is declared as any so that
	// whatever JSON value appears there decodes without error.
	Type any `json:"@type"`

	// Title is taken from the "name" key.
	Title string `json:"name"`

	// Text is taken from the "articleBody" key.
	Text string `json:"articleBody"`

	// Cats is the list of strings under "articleSection".
	Cats []string `json:"articleSection"`

	// Tags is the list of strings under "keywords".
	Tags []string `json:"keywords"`
}
|
|
|
|
var (
|
|
x = e.ExitErr
|
|
|
|
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
|
|
)
|
|
|
|
func main() {
|
|
|
|
var ds string
|
|
switch len(os.Args) {
|
|
case 1:
|
|
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
|
|
ds = fmt.Sprintf("%d-%02d", year, week)
|
|
case 2:
|
|
if !reYearWeek.MatchString(os.Args[1]) {
|
|
x(fmt.Errorf("arg must be yyyy-ww"))
|
|
}
|
|
ds = os.Args[1]
|
|
default:
|
|
x(fmt.Errorf("too many arguments"))
|
|
}
|
|
dp := ds[:4] + "/" + ds[5:]
|
|
|
|
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
|
|
x(os.MkdirAll("out", 0777))
|
|
files, err := os.ReadDir(".")
|
|
x(err)
|
|
for _, file := range files {
|
|
filename := file.Name()
|
|
if !strings.HasSuffix(filename, ".json") {
|
|
continue
|
|
}
|
|
b, err := os.ReadFile(filename)
|
|
x(err)
|
|
fp, err := os.Create("out/" + filename[:len(filename)-5] + ".txt")
|
|
x(err)
|
|
item := getItem(b, filename)
|
|
for _, cat := range item.Cats {
|
|
x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat)))
|
|
}
|
|
for _, tag := range item.Tags {
|
|
x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
|
|
}
|
|
x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
|
|
x(fp.WriteString(u.FixSpace(item.Text)))
|
|
x(fp.Close())
|
|
}
|
|
}
|
|
|
|
func getItem(b []byte, filename string) Item {
|
|
var graph Graph
|
|
if json.Unmarshal(b, &graph) == nil {
|
|
if graph.Items != nil {
|
|
for _, item := range graph.Items {
|
|
switch v := item.Type.(type) {
|
|
case string:
|
|
if v == "NewsArticle" {
|
|
return item
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
var item Item
|
|
x(json.Unmarshal(b, &item), filename)
|
|
return item
|
|
}
|