package main import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" "github.com/pebbe/textcat/v2" u "git.web.rug.nl/p209327/nlnieuws/internal/util" "bytes" "encoding/xml" "fmt" "os" "regexp" "strings" "time" ) type Item struct { Title string `xml:"title"` Text string `xml:"encoded"` Cats []string `xml:"category"` } var ( x = e.ExitErr w = e.WarnErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { tc := textcat.NewTextCat() tc.EnableLanguages("en.utf8", "nl.utf8") var ds string switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp)) x(os.MkdirAll("out", 0777)) files, err := os.ReadDir(".") x(err) for _, file := range files { filename := file.Name() if !strings.HasSuffix(filename, ".xml") { continue } b, err := os.ReadFile(filename) x(err) var buf bytes.Buffer var item Item x(xml.Unmarshal(b, &item)) x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() divs, err := root.Search(`//div[@class="donatieformlinks"]`) x(err) for _, div := range divs { div.Remove() } pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) x(err) for _, p := range pp { x(buf.WriteString(u.AddEnd(u.FixSpace(p.Content())))) } text := buf.String() langs, err := tc.Classify(text) if err != nil { _ = w(fmt.Errorf("language: %v in %s", err, filename)) continue } if len(langs) != 1 || langs[0] != "nl.utf8" { _ = w(fmt.Errorf("language: %v in %s", langs, filename)) continue } fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") x(err) for _, cat := range item.Cats { x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat))) } x(fp.WriteString(text)) x(fp.Close()) } }