package main import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" u "git.web.rug.nl/p209327/nlnieuws/internal/util" "encoding/xml" "fmt" "os" "regexp" "strings" "time" ) type Item struct { Title string `xml:"title"` Text string `xml:"encoded"` Cats []string `xml:"category"` } var ( x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { var ds string switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp)) x(os.MkdirAll("out", 0777)) files, err := os.ReadDir(".") x(err) for _, file := range files { filename := file.Name() if !strings.HasSuffix(filename, ".xml") { continue } b, err := os.ReadFile(filename) x(err) fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") x(err) var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body//p`) x(err) for _, p := range pp { x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } x(fp.Close()) } }