Files
nlnieuws/NOS/cmd/json2txt/json2txt.go
Peter Kleiweg 36f051a8a9 first commit
2026-03-02 15:34:37 +01:00

94 lines
1.8 KiB
Go

package main
import (
"github.com/pebbe/util"
"encoding/json"
"fmt"
"os"
"regexp"
"strings"
"time"
)
// Item is the subset of one article's JSON document that this program
// reads. Keys in the input that are not listed here are ignored by
// encoding/json.
type Item struct {
	// Title is decoded from the "name" key.
	Title string `json:"name"`

	// Text is decoded from the "articleBody" key.
	Text string `json:"articleBody"`

	// Cats is decoded from the "articleSection" key; each entry becomes a
	// "cat" metadata line in the output.
	Cats []string `json:"articleSection"`

	// Tags is decoded from the "keywords" key; each entry becomes a "tag"
	// metadata line in the output.
	Tags []string `json:"keywords"`
}
var (
	// reYearWeek validates the optional command-line argument: a "2"
	// followed by three digits, a hyphen, and a two-digit number whose first
	// digit is 0-5. It checks the shape only; values such as "00" or "59"
	// pass even though no ISO week has that number.
	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)

	// x is a short alias for util.CheckErr, used to handle every error in
	// this program.
	// NOTE(review): main relies on x stopping the program when given a
	// non-nil error — confirm against github.com/pebbe/util.
	x = util.CheckErr
)
func main() {
var ds string
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
x(os.MkdirAll("out", 0777))
files, err := os.ReadDir(".")
x(err)
for _, file := range files {
filename := file.Name()
if !strings.HasSuffix(filename, ".json") {
continue
}
b, err := os.ReadFile(filename)
x(err)
fp, err := os.Create("out/" + filename[:len(filename)-5] + ".txt")
x(err)
var item Item
x(json.Unmarshal(b, &item))
for _, cat := range item.Cats {
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
x(err)
}
for _, cat := range item.Tags {
_, err = fmt.Fprintf(fp, "##META text tag = %s\n", cat)
x(err)
}
_, err = fp.WriteString(addEnd(item.Title))
x(err)
_, err = fp.WriteString(item.Text)
x(err)
x(fp.Close())
}
}
// addEnd turns s into a single terminated line.
//
// Leading and trailing whitespace is removed first. If nothing remains, the
// empty string is returned. Otherwise the result is the trimmed text plus a
// newline, with a "." inserted before the newline unless the text already
// ends in '.', '!' or '?', optionally followed by one closing straight quote
// (" or '). The check is byte-based, so typographic quotes are not
// recognised as closing quotes.
func addEnd(s string) string {
	line := strings.TrimSpace(s)
	if line == "" {
		return ""
	}

	last := len(line) - 1
	switch line[last] {
	case '.', '!', '?':
		return line + "\n"
	case '"', '\'':
		// A closing quote counts as terminated only when the character
		// right before it is sentence-final punctuation.
		if last > 0 {
			switch line[last-1] {
			case '.', '!', '?':
				return line + "\n"
			}
		}
	}
	return line + ".\n"
}