// This program converts the Tzum news items of one ISO week from XML files
// to plain-text files.
//
// It takes at most one argument, the week to process in the form yyyy.ww.
// Without an argument it processes the ISO week of the date seven days ago.
// Every *.xml file in /net/corpora/nlnieuws/Tzum/yyyy/wWW is converted to a
// file with the same base name and a .txt extension in the subdirectory
// "out".
package main

import (
	"encoding/xml"
	"fmt"
	"os"
	"regexp"
	"strings"
	"time"

	e "codeberg.org/pebbe/errors"
	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
	"github.com/jbowtie/gokogiri"
)

// Item holds the parts of one XML news item that are written to the output.
type Item struct {
	Title string   `xml:"title"`
	Text  string   `xml:"encoded"` // HTML body of the item; parsed with gokogiri below
	Cats  []string `xml:"category"`
}

var (
	// x is the error check used for every fallible call in this file.
	// NOTE(review): ExitErr presumably reports a non-nil error and terminates
	// the program; it is also handed the (n, err) results of write calls, so
	// it must accept multiple values — confirm against the errors package.
	x = e.ExitErr

	// reYearWeek validates the command-line argument: a four-digit year
	// starting with 2, a dot, and a two-digit number in the range 00-59.
	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
)

// main selects the week to process, changes into that week's data directory
// and converts every .xml file found there.
func main() {
	var ds string
	switch len(os.Args) {
	case 1:
		// No argument: use the ISO year and week of the date one week ago.
		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
		ds = fmt.Sprintf("%d.%02d", year, week)
	case 2:
		if !reYearWeek.MatchString(os.Args[1]) {
			x(fmt.Errorf("arg must be yyyy.ww"))
		}
		ds = os.Args[1]
	default:
		x(fmt.Errorf("too many arguments"))
	}

	// "yyyy.ww" becomes the relative directory "yyyy/wWW".
	dp := ds[:4] + "/w" + ds[5:]
	x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
	x(os.MkdirAll("out", 0777))

	files, err := os.ReadDir(".")
	x(err)

	for _, file := range files {
		filename := file.Name()
		if !strings.HasSuffix(filename, ".xml") {
			continue
		}
		convertFile(filename)
	}
}

// convertFile reads the XML item in filename (relative to the current
// directory) and writes its plain-text form to out/<base>.txt.
//
// The output consists of one "##META text tag = ..." line for each category
// other than "Nieuws", followed by the title and then the text of every
// //body/p paragraph of the item's HTML, except paragraphs that contain
// "verscheen eerst op Tzum.". All errors are passed to x.
func convertFile(filename string) {
	b, err := os.ReadFile(filename)
	x(err)

	fp, err := os.Create("out/" + strings.TrimSuffix(filename, ".xml") + ".txt")
	x(err)

	var item Item
	x(xml.Unmarshal(b, &item))

	for _, cat := range item.Cats {
		t := u.FixSpace(cat)
		if t == "Nieuws" {
			continue
		}
		x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
	}

	// NOTE(review): FixSpace and AddEnd are project helpers not visible here;
	// presumably they normalise whitespace and terminate the line — confirm.
	x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))

	doc, err := gokogiri.ParseHtml([]byte(item.Text))
	x(err)
	// gokogiri documents are backed by libxml2 memory that Go's garbage
	// collector does not reclaim; release it once this file is done.
	defer doc.Free()

	pp, err := doc.Root().Search(`//body/p`)
	x(err)

	for _, p := range pp {
		s := p.Content()
		if strings.Contains(s, "verscheen eerst op Tzum.") {
			continue
		}
		x(fp.WriteString(u.AddEnd(u.FixSpace(s))))
	}

	// Close explicitly rather than deferring so that a failed close is
	// reported through x.
	x(fp.Close())
}