package main import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" "encoding/xml" "fmt" "os" "regexp" "strings" "time" ) type Item struct { Title string `xml:"title"` Text string `xml:"encoded"` Cats []string `xml:"category"` } var ( w = e.WarnErr x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) func main() { var ds string switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() ds = fmt.Sprintf("%d-%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { x(fmt.Errorf("arg must be yyyy-ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } dp := ds[:4] + "/" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp)) x(os.MkdirAll("out", 0777)) files, err := os.ReadDir(".") x(err) for _, file := range files { filename := file.Name() if !strings.HasSuffix(filename, ".xml") { continue } b, err := os.ReadFile(filename) x(err) fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") x(err) var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) } x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body//p`) x(err) if len(pp) == 0 { pp, err = root.Search(`//body`) x(err) } if len(pp) == 0 { _ = w(fmt.Errorf("empty: %s", filename)) } for _, p := range pp { x(fp.WriteString(addEnd(fixSpace(p.Content())))) } x(fp.Close()) } } func addEnd(s string) string { s = strings.TrimSpace(s) n := len(s) if n == 0 { return "" } if n > 0 { if strings.ContainsAny(s[n-1:], ".!?") { return s + "\n" } } if n > 1 { s2 := s[n-2:] if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { return s + "\n" } } return s + ".\n" } func fixSpace(s string) string { return strings.Join(strings.Fields(s), " ") }