diff --git a/AT5/cmd/at5/at5.go b/AT5/cmd/at5/at5.go index ca9ba6b..2cd7a46 100644 --- a/AT5/cmd/at5/at5.go +++ b/AT5/cmd/at5/at5.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -26,7 +26,7 @@ type ItemT struct { } var ( - x = util.CheckErr + p = e.PanicErr agent = "AhrefsBot/7.0" ) @@ -36,22 +36,29 @@ func exists(filename string) bool { } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://rss.at5.nl/rss", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -59,7 +66,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -71,19 +78,23 @@ func main() { } filename := dirname + "/" + url.PathEscape(basename) - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = true + }() } - } diff --git a/AT5/cmd/metadata/metadata.go b/AT5/cmd/metadata/metadata.go index ad19fb1..9f86321 100644 --- a/AT5/cmd/metadata/metadata.go +++ b/AT5/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -16,7 +16,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr data = make(map[string][]string) location *time.Location ) @@ -51,18 +51,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/AT5/cmd/xml2txt/xml2txt.go b/AT5/cmd/xml2txt/xml2txt.go index 2a0d94a..1d32880 100644 --- a/AT5/cmd/xml2txt/xml2txt.go +++ b/AT5/cmd/xml2txt/xml2txt.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) @@ -55,18 +55,15 @@ func main() { x(err) var item Item x(xml.Unmarshal(b, &item), filename) - _, err = fp.WriteString(addEnd(fixSpace(item.Title))) - x(err) + x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body/p | //body/h2`) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) - x(err) + x(fp.WriteString(addEnd(fixSpace(p.Content())))) } - x(err) x(fp.Close()) } } diff --git a/AT5/txt2corpus.sh b/AT5/txt2corpus.sh index 5bdbff2..0534355 100755 --- a/AT5/txt2corpus.sh +++ b/AT5/txt2corpus.sh @@ -53,8 +53,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/Amsterdam/amsterdam.go b/Amsterdam/amsterdam.go index 8a70075..76a5b7a 100644 --- a/Amsterdam/amsterdam.go +++ b/Amsterdam/amsterdam.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -29,7 +29,7 @@ type ItemT struct { } var ( - x = util.CheckErr + p = e.PanicErr agent = "AhrefsBot/7.0" // agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" ) @@ -40,22 +40,29 @@ func exists(filename string) bool { } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -63,26 +70,31 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) dirname := fmt.Sprintf("/net/corpora/nlnieuws/amsterdam/%d/%02d", t.Year(), int(t.Month())) if exists(dirname + "/lock") { continue } filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/")) - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) - doArticle(filename, item.Link, item.Title, t) + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + doArticle(filename, item.Link, item.Title, t) + ok = true + }() } } @@ -93,58 +105,54 @@ func doArticle(filename string, url string, title string, timestamp time.Time) { time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) doc, err := gokogiri.ParseHtml(body) - x(err) + p(err) root := doc.Root() fp, err := os.Create(filename + ".txt") - x(err) + p(err) - _, err = fp.WriteString(addEnd(title)) - x(err) + p(fp.WriteString(addEnd(title))) count := 0 pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`) - x(err) - for _, p := range pp { - _, err = fp.WriteString(addEnd(p.Content())) - x(err) + p(err) + for _, p1 := range pp { + p(fp.WriteString(addEnd(p1.Content()))) count++ } ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`) - x(err) + p(err) for _, el := range ell { if n := el.Name(); n == "p" || n == "h3" { - _, err = fp.WriteString(addEnd(el.Content())) + p(fp.WriteString(addEnd(el.Content()))) count++ - x(err) } } - x(fp.Close()) + p(fp.Close()) - x(os.Chtimes(filename+".txt", timestamp, timestamp)) + p(os.Chtimes(filename+".txt", timestamp, timestamp)) if count == 0 { fp, err := os.Create(filename + ".debug.html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".debug.html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".debug.html", timestamp, timestamp)) } } diff --git a/GG/cmd/gg/gg.go b/GG/cmd/gg/gg.go index 2634084..3177cdc 100644 --- a/GG/cmd/gg/gg.go +++ b/GG/cmd/gg/gg.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -29,8 +29,8 @@ type ItemT struct { } var ( - x = util.CheckErr - w = util.WarnErr + p = e.PanicErr + w = e.WarnErr // agent = "AhrefsBot/7.0" agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" ) @@ -48,26 +48,36 @@ func fileDate(filename string) string { s := string(b) i1 := strings.Index(s, "") + 10 i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } return s[i1:i2] } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://gemeente.groningen.nl/feed/rss/nieuws", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -75,7 +85,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -86,26 +96,28 @@ func main() { ts := fmt.Sprintf("%d", t.Unix()) needUpdate := fileDate(filename+".xml") != ts - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) - if !doArticle(filename, item.Link, item.Title, t, needUpdate) { - x(os.Remove(filename + ".xml")) - } + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, item.Title, t, needUpdate) + }() } } -func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool { +func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) { if needUpdate { _ = os.Remove(filename + ".err") _ = os.Remove(filename + ".html") @@ -119,18 +131,18 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) doc, err := gokogiri.ParseHtml(body) - x(err) + p(err) root := doc.Root() @@ -138,41 +150,37 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n `//div[contains(@class,"component-richtext")]/p` + ` | ` + `//div[contains(@class,"component-richtext")]/h2`) - x(err) + p(err) if len(ell) == 0 { _ = w(fmt.Errorf("empty: %s", url)) fp, err := os.Create(filename + ".err") - x(err) - _, err = fmt.Fprintf(fp, "empty: %s\n", url) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err) + p(fmt.Fprintf(fp, "empty: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err := os.Create(filename + ".txt") - x(err) + p(err) - _, err = fp.WriteString(addEnd(fixSpace(title))) - x(err) + p(fp.WriteString(addEnd(fixSpace(title)))) for _, el := range ell { - _, err = fp.WriteString(addEnd(fixSpace(el.Content()))) - x(err) + p(fp.WriteString(addEnd(fixSpace(el.Content())))) } - x(fp.Close()) + p(fp.Close()) - x(os.Chtimes(filename+".txt", timestamp, timestamp)) + p(os.Chtimes(filename+".txt", timestamp, timestamp)) return true } diff --git a/GG/cmd/metadata/metadata.go b/GG/cmd/metadata/metadata.go index 938cc7a..1b0b3c5 100644 --- a/GG/cmd/metadata/metadata.go +++ b/GG/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -16,7 +16,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr data = make(map[string][]string) location *time.Location ) @@ -51,18 +51,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/GG/txt2corpus.sh b/GG/txt2corpus.sh index 628d432..25cf7d4 100755 --- a/GG/txt2corpus.sh +++ b/GG/txt2corpus.sh @@ -51,8 +51,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/NOS/cmd/json2txt/json2txt.go b/NOS/cmd/json2txt/json2txt.go index b898ed6..9f7b3b5 100644 --- a/NOS/cmd/json2txt/json2txt.go +++ b/NOS/cmd/json2txt/json2txt.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/json" "fmt" @@ -19,7 +19,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) @@ -57,17 +57,13 @@ func main() { var item Item x(json.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) - x(err) + x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) } for _, tag := range item.Tags { - _, err = fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)) - x(err) + x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))) } - _, err = fp.WriteString(addEnd(fixSpace(item.Title))) - x(err) - _, err = fp.WriteString(fixSpace(item.Text)) - x(err) + x(fp.WriteString(addEnd(fixSpace(item.Title)))) + x(fp.WriteString(fixSpace(item.Text))) x(fp.Close()) } } diff --git a/NOS/cmd/metadata/metadata.go b/NOS/cmd/metadata/metadata.go index 1a7f802..f072704 100644 --- a/NOS/cmd/metadata/metadata.go +++ b/NOS/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -66,18 +66,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/NOS/cmd/nos/nos.go b/NOS/cmd/nos/nos.go index 9707638..4bb4a6a 100644 --- a/NOS/cmd/nos/nos.go +++ b/NOS/cmd/nos/nos.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -27,8 +27,8 @@ type ItemT struct { } var ( - x = util.CheckErr - w = util.WarnErr + p = e.PanicErr + w = e.WarnErr agent = "AhrefsBot/7.0" ) @@ -45,21 +45,31 @@ func fileDate(filename string) string { s := string(b) i1 := strings.Index(s, "") + 10 i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } return s[i1:i2] } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + resp, err := http.Get("https://feeds.nos.nl/nosnieuwsalgemeen") - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -74,7 +84,7 @@ func main() { break } } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -85,22 +95,24 @@ func main() { ts := fmt.Sprintf("%d", t.Unix()) needUpdate := fileDate(filename+".xml") != ts - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) - if !doArticle(filename, item.Link, t, needUpdate) { - x(os.Remove(filename + ".xml")) - } + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, t, needUpdate) + }() } } @@ -118,15 +130,15 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) s := string(body) @@ -147,27 +159,24 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool _ = w(fmt.Errorf("script jsonld not found: %s", url)) fp, err := os.Create(filename + ".err") - x(err) - _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err := os.Create(filename + ".json") - x(err) - _, err = fp.WriteString(s) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".json", timestamp, timestamp)) + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) return true } diff --git a/NOS/txt2corpus.sh b/NOS/txt2corpus.sh index 4f29212..4fbad73 100755 --- a/NOS/txt2corpus.sh +++ b/NOS/txt2corpus.sh @@ -53,8 +53,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/NU/cmd/metadata/metadata.go b/NU/cmd/metadata/metadata.go index 7c5b52b..cad5eaa 100644 --- a/NU/cmd/metadata/metadata.go +++ b/NU/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -56,18 +56,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/NU/cmd/nu/nu.go b/NU/cmd/nu/nu.go index 17254cf..c3b617c 100644 --- a/NU/cmd/nu/nu.go +++ b/NU/cmd/nu/nu.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bytes" "encoding/json" @@ -39,8 +39,8 @@ type GItem struct { } var ( - x = util.CheckErr - w = util.WarnErr + p = e.PanicErr + w = e.WarnErr agent = "AhrefsBot/7.0" ) @@ -57,21 +57,31 @@ func fileDate(filename string) string { s := string(b) i1 := strings.Index(s, "") + 10 i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } return s[i1:i2] } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + resp, err := http.Get("https://www.nu.nl/rss") - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -79,7 +89,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -90,22 +100,24 @@ func main() { ts := fmt.Sprintf("%d", t.Unix()) needUpdate := fileDate(filename+".xml") != ts - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) - if !doArticle(filename, item.Link, t, needUpdate) { - x(os.Remove(filename + ".xml")) - } + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, t, needUpdate) + }() } } @@ -126,15 +138,15 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) s := string(body) ok := true @@ -154,38 +166,34 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool _ = w(fmt.Errorf("script jsonld not found: %s", url)) fp, err := os.Create(filename + ".err") - x(err) - _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err := os.Create(filename + ".json") - x(err) - _, err = fp.WriteString(s) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".json", timestamp, timestamp)) + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) var doc Doc if err = json.Unmarshal([]byte(s), &doc); err != nil { _ = w(err, url) fp, err2 := os.Create(filename + ".err") - x(err2) - _, err2 = fmt.Fprintf(fp, "%s: %v\n%s\n", url, err, s) - x(err2) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err2) + p(fmt.Fprintf(fp, "%s: %v\n%s\n", url, err, s)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) return false } @@ -193,8 +201,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool cats := make([]string, 0) var buffer bytes.Buffer for _, i := range doc.Graph { - _, err = buffer.WriteString(html.UnescapeString(i.ArticleBody)) - x(err) + p(buffer.WriteString(html.UnescapeString(i.ArticleBody))) cats = append(cats, i.ArticleSection...) } text := buffer.String() @@ -224,20 +231,17 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool text = strings.Join(lines, "\n") + "\n" fp, err := os.Create(filename + ".txt") - x(err) + p(err) if len(cats) == 0 { - _, err := fmt.Fprintln(fp, "##META text cat =") - x(err) + p(fmt.Fprintln(fp, "##META text cat =")) } else { for _, cat := range cats { - _, err := fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) - x(err) + p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) } } - _, err = fp.WriteString(text) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".txt", timestamp, timestamp)) + p(fp.WriteString(text)) + p(fp.Close()) + p(os.Chtimes(filename+".txt", timestamp, timestamp)) } return true diff --git a/NU/txt2corpus.sh b/NU/txt2corpus.sh index 4c18246..d3051fc 100755 --- a/NU/txt2corpus.sh +++ b/NU/txt2corpus.sh @@ -51,8 +51,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/NieuwsNL/cmd/metadata/metadata.go b/NieuwsNL/cmd/metadata/metadata.go index b011d50..bbb6eba 100644 --- a/NieuwsNL/cmd/metadata/metadata.go +++ b/NieuwsNL/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -56,18 +56,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go index e7b9166..f8554ee 100644 --- a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go +++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "bytes" "encoding/xml" @@ -30,8 +30,8 @@ type ItemT struct { } var ( - x = util.CheckErr - w = util.WarnErr + p = e.PanicErr + w = e.WarnErr agent = "AhrefsBot/7.0" ) @@ -48,26 +48,36 @@ func fileDate(filename string) string { s := string(b) i1 := strings.Index(s, "") + 10 i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } return s[i1:i2] } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://nieuws.nl/sitemap/news.xml", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -75,7 +85,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) if exists(dirname + "/lock") { continue @@ -85,22 +95,24 @@ func main() { ts := fmt.Sprintf("%d", t.Unix()) needUpdate := fileDate(filename+".xml") != ts - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) - if !doArticle(filename, item.Link, item.Title, t, needUpdate) { - x(os.Remove(filename + ".xml")) - } + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, item.Title, t, needUpdate) + }() } } @@ -118,18 +130,18 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) doc, err := gokogiri.ParseHtml(body) - x(err) + p(err) var buf bytes.Buffer fouten := make([]string, 0) @@ -138,10 +150,9 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n var cat string aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`) - x(err) + p(err) if len(aa) == 0 { - _, err = fmt.Fprintln(&buf, "##META text cat =") - x(err) + p(fmt.Fprintln(&buf, "##META text cat =")) _ = w(fmt.Errorf("no cat: %s", url)) // geen fout, maar waarschuwing als er meer fouten zijn fouten = append(fouten, fmt.Sprintf("no text: %s\n", url)) @@ -149,52 +160,46 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } else { for _, a := range aa { cat = strings.ReplaceAll(a.Content(), "\n", " ") - _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)) - x(err) + p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))) } } - _, err = buf.WriteString(addEnd(fixSpace(title))) - x(err) + p(buf.WriteString(addEnd(fixSpace(title)))) // oud: //div[@id="article-blocks"]//p pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`) - x(err) + p(err) if len(pp) == 0 { _ = w(fmt.Errorf("empty: %s", url)) // dit is echt fout fouten = append(fouten, fmt.Sprintf("empty: %s\n", url)) fp, err := os.Create(filename + ".err") - x(err) + p(err) for _, fout := range fouten { - _, err = fp.WriteString(fout) - x(err) + p(fp.WriteString(fout)) } - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false // echt fout } - for _, p := range pp { - _, err = buf.WriteString(addEnd(fixSpace(p.Content()))) - x(err) + for _, p1 := range pp { + p(buf.WriteString(addEnd(fixSpace(p1.Content())))) } fp, err := os.Create(filename + ".txt") - x(err) - _, err = fp.Write(buf.Bytes()) - x(err) - x(fp.Close()) + p(err) + p(fp.Write(buf.Bytes())) + p(fp.Close()) - x(os.Chtimes(filename+".txt", timestamp, timestamp)) + p(os.Chtimes(filename+".txt", timestamp, timestamp)) return true } diff --git a/NieuwsNL/txt2corpus.sh b/NieuwsNL/txt2corpus.sh index 6a93640..eda4ea1 100755 --- a/NieuwsNL/txt2corpus.sh +++ b/NieuwsNL/txt2corpus.sh @@ -53,8 +53,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../../metadata 2> err -rm err +../../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/RO/cmd/metadata/metadata.go b/RO/cmd/metadata/metadata.go index eb04fd0..8f603c9 100644 --- a/RO/cmd/metadata/metadata.go +++ b/RO/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -66,18 +66,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/RO/cmd/ro/ro.go b/RO/cmd/ro/ro.go index 045531c..a992a07 100644 --- a/RO/cmd/ro/ro.go +++ b/RO/cmd/ro/ro.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -26,7 +26,7 @@ type ItemT struct { } var ( - x = util.CheckErr + p = e.PanicErr agent = "AhrefsBot/7.0" ) @@ -36,22 +36,29 @@ func exists(filename string) bool { } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://reportersonline.nl/feed/", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -59,7 +66,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -71,19 +78,24 @@ func main() { } filename := dirname + "/" + url.PathEscape(basename) - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = true + }() } } diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index 4e705bd..5d0f1fd 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -19,7 +19,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) @@ -57,11 +57,9 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) - x(err) + x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) } - _, err = fp.WriteString(addEnd(fixSpace(item.Title))) - x(err) + x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() @@ -73,10 +71,8 @@ func main() { pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) - x(err) + x(fp.WriteString(addEnd(fixSpace(p.Content())))) } - x(err) x(fp.Close()) } } diff --git a/RO/txt2corpus.sh b/RO/txt2corpus.sh index a2017e4..15859d5 100755 --- a/RO/txt2corpus.sh +++ b/RO/txt2corpus.sh @@ -53,8 +53,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/Sargasso/cmd/metadata/metadata.go b/Sargasso/cmd/metadata/metadata.go index 7b134e4..fe344f5 100644 --- a/Sargasso/cmd/metadata/metadata.go +++ b/Sargasso/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -66,18 +66,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/Sargasso/cmd/sargasso/sargasso.go b/Sargasso/cmd/sargasso/sargasso.go index 1428972..41e4d1f 100644 --- a/Sargasso/cmd/sargasso/sargasso.go +++ b/Sargasso/cmd/sargasso/sargasso.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -26,7 +26,7 @@ type ItemT struct { } var ( - x = util.CheckErr + p = e.PanicErr agent = "AhrefsBot/7.0" ) @@ -36,22 +36,29 @@ func exists(filename string) bool { } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://sargasso.nl/feed/", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -59,7 +66,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -71,19 +78,24 @@ func main() { } filename := dirname + "/" + url.PathEscape(basename) - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = true + }() } } diff --git a/Sargasso/cmd/xml2txt/xml2txt.go b/Sargasso/cmd/xml2txt/xml2txt.go index a55ed6b..be6f9ed 100644 --- a/Sargasso/cmd/xml2txt/xml2txt.go +++ b/Sargasso/cmd/xml2txt/xml2txt.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -19,7 +19,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) @@ -57,21 +57,17 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) - x(err) + x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) } - _, err = fp.WriteString(addEnd(fixSpace(item.Title))) - x(err) + x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() pp, err := root.Search(`//body//p`) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) - x(err) + x(fp.WriteString(addEnd(fixSpace(p.Content())))) } - x(err) x(fp.Close()) } } diff --git a/Sargasso/txt2corpus.sh b/Sargasso/txt2corpus.sh index ae709e6..4ae2b31 100755 --- a/Sargasso/txt2corpus.sh +++ b/Sargasso/txt2corpus.sh @@ -53,8 +53,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/Sikkom/cmd/metadata/metadata.go b/Sikkom/cmd/metadata/metadata.go index fd787db..5051d6c 100644 --- a/Sikkom/cmd/metadata/metadata.go +++ b/Sikkom/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -16,7 +16,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr data = make(map[string][]string) location *time.Location ) @@ -51,18 +51,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/Sikkom/cmd/sikkom/sikkom.go b/Sikkom/cmd/sikkom/sikkom.go index 1064ce6..25c42e7 100644 --- a/Sikkom/cmd/sikkom/sikkom.go +++ b/Sikkom/cmd/sikkom/sikkom.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -29,8 +29,8 @@ type ItemT struct { } var ( - x = util.CheckErr - w = util.WarnErr + p = e.PanicErr + w = e.WarnErr agent = "AhrefsBot/7.0" ) @@ -47,21 +47,31 @@ func fileDate(filename string) string { s := string(b) i1 := strings.Index(s, "") + 10 i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } return s[i1:i2] } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + resp, err := http.Get("https://www.sikkom.nl/api/feed/rss") - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -69,7 +79,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -80,22 +90,24 @@ func main() { ts := fmt.Sprintf("%d", t.Unix()) needUpdate := fileDate(filename+".xml") != ts - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) - if !doArticle(filename, item.Link, item.Title, t, needUpdate) { - x(os.Remove(filename + ".xml")) - } + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = doArticle(filename, item.Link, item.Title, t, needUpdate) + }() } } @@ -115,15 +127,15 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) s := string(body) @@ -144,69 +156,63 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n _ = w(fmt.Errorf("script jsonld not found: %s", url)) fp, err := os.Create(filename + ".err") - x(err) - _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err := os.Create(filename + ".json") - x(err) - _, err = fp.WriteString(s) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".json", timestamp, timestamp)) + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) doc, err := gokogiri.ParseHtml(body) - x(err) + p(err) root := doc.Root() pp, err := root.Search(`//div[contains(@class,"article-page__body")]//p`) - x(err) + p(err) if len(pp) == 0 { _ = w(fmt.Errorf("empty: %s", url)) fp, err := os.Create(filename + ".err") - x(err) - _, err = fmt.Fprintf(fp, "empty: %s\n", url) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err) + p(fmt.Fprintf(fp, "empty: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(err) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err = os.Create(filename + ".txt") - x(err) + p(err) - _, err = fp.WriteString(addEnd(fixSpace(title))) - x(err) + p(fp.WriteString(addEnd(fixSpace(title)))) - for _, p := range pp { - _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) - x(err) + for _, p1 := range pp { + p(fp.WriteString(addEnd(fixSpace(p1.Content())))) } - x(fp.Close()) + p(fp.Close()) return true } diff --git a/Sikkom/txt2corpus.sh b/Sikkom/txt2corpus.sh index 342fda9..d18b92d 100755 --- a/Sikkom/txt2corpus.sh +++ b/Sikkom/txt2corpus.sh @@ -51,8 +51,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/Tzum/cmd/metadata/metadata.go b/Tzum/cmd/metadata/metadata.go index 8a6851f..46d704d 100644 --- a/Tzum/cmd/metadata/metadata.go +++ b/Tzum/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -66,18 +66,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/Tzum/cmd/tzum/tzum.go b/Tzum/cmd/tzum/tzum.go index ed7acc7..5e0e3c9 100644 --- a/Tzum/cmd/tzum/tzum.go +++ b/Tzum/cmd/tzum/tzum.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "encoding/xml" "fmt" @@ -26,7 +26,7 @@ type ItemT struct { } var ( - x = util.CheckErr + p = e.PanicErr agent = "AhrefsBot/7.0" ) @@ -36,22 +36,29 @@ func exists(filename string) bool { } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + req, err := http.NewRequest("GET", "https://www.tzum.info/feed/", nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -59,7 +66,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } - x(err) + p(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week) if exists(dirname + "/lock") { @@ -71,19 +78,24 @@ func main() { } filename := dirname + "/" + url.PathEscape(basename) - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) + ok = true + }() } } diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 54b6384..87a3bd2 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "encoding/xml" "fmt" @@ -19,7 +19,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) ) @@ -57,11 +57,9 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) - x(err) + x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) } - _, err = fp.WriteString(addEnd(fixSpace(item.Title))) - x(err) + x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) root := doc.Root() @@ -70,11 +68,9 @@ func main() { for _, p := range pp { s := p.Content() if !strings.Contains(s, "verscheen eerst op Tzum.") { - _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) - x(err) + x(fp.WriteString(addEnd(fixSpace(p.Content())))) } } - x(err) x(fp.Close()) } } diff --git a/Tzum/txt2corpus.sh b/Tzum/txt2corpus.sh index f5840d3..280fabf 100755 --- a/Tzum/txt2corpus.sh +++ b/Tzum/txt2corpus.sh @@ -53,8 +53,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/VRT/cmd/metadata/metadata.go b/VRT/cmd/metadata/metadata.go index 0cc0262..43f29e8 100644 --- a/VRT/cmd/metadata/metadata.go +++ b/VRT/cmd/metadata/metadata.go @@ -1,7 +1,7 @@ package main import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "bufio" "encoding/xml" @@ -18,7 +18,7 @@ type Item struct { } var ( - x = util.CheckErr + x = e.ExitErr escape = html.EscapeString data = make(map[string][]string) location *time.Location @@ -56,18 +56,13 @@ func main() { i += strings.Index(s[i:], "<") fp, err := os.Create("xml/" + filename + ".tmp") x(err) - _, err = fp.WriteString(s[:i]) - x(err) - _, err = fp.WriteString("\n \n") - x(err) + x(fp.WriteString(s[:i])) + x(fp.WriteString("\n \n")) for _, m := range data[base] { - _, err = fp.WriteString(" " + m + "\n") - x(err) + x(fp.WriteString(" " + m + "\n")) } - _, err = fp.WriteString(" \n ") - x(err) - _, err = fp.WriteString(stripMeta(s[i:])) - x(err) + x(fp.WriteString(" \n ")) + x(fp.WriteString(stripMeta(s[i:]))) x(fp.Close()) x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) } diff --git a/VRT/cmd/vrt/vrt.go b/VRT/cmd/vrt/vrt.go index 32a1802..5159feb 100644 --- a/VRT/cmd/vrt/vrt.go +++ b/VRT/cmd/vrt/vrt.go @@ -1,8 +1,8 @@ package main import ( + e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" - "github.com/pebbe/util" "bytes" "encoding/xml" @@ -43,8 +43,8 @@ type LinkT struct { } var ( - x = util.CheckErr - w = util.WarnErr + p = e.PanicErr + w = e.WarnErr // agent = "AhrefsBot/7.0" agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" ) @@ -62,21 +62,31 @@ func fileDate(filename string) string { s := string(b) i1 := strings.Index(s, "") + 10 i2 := strings.Index(s, "") + if i2 < i1 { + return "" + } return s[i1:i2] } func main() { + defer func() { + if e.Panicked() { + _ = recover() + os.Exit(1) + } + }() + resp, err := http.Get("https://www.vrt.be/vrtnws/nl.rss.headlines.xml") - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) var rss Rss - x(xml.Unmarshal(body, &rss)) + p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { - x(fmt.Errorf("len(rss.Items) == 0")) + p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { @@ -84,7 +94,7 @@ func main() { if err != nil { t, err = time.Parse(time.RFC1123, item.Published) } - x(err) + p(err) t2, err := time.Parse(time.RFC3339Nano, item.Updated) if err != nil { t2, _ = time.Parse(time.RFC1123, item.Updated) @@ -102,29 +112,31 @@ func main() { ts := fmt.Sprintf("%d", t.Unix()) needUpdate := fileDate(filename+".xml") != ts - x(os.MkdirAll(dirname, 0777)) - fp, err := os.Create(filename + ".xml") - x(err) - _, err = fp.WriteString("\n\n") - x(err) - _, err = fmt.Fprintf(fp, "%d", t.Unix()) - x(err) - _, err = fp.Write(item.Data) - x(err) - _, err = fp.WriteString("\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".xml", t, t)) + p(os.MkdirAll(dirname, 0777)) + func() { + var ok bool + defer func() { + if !ok { + _ = os.Remove(filename + ".xml") + } + }() + fp, err := os.Create(filename + ".xml") + p(err) + p(fp.WriteString("\n\n")) + p(fmt.Fprintf(fp, "%d", t.Unix())) + p(fp.Write(item.Data)) + p(fp.WriteString("\n")) + p(fp.Close()) + p(os.Chtimes(filename+".xml", t, t)) - var link string - for _, l := range item.Link { - if l.Type == "text/html" { - link = l.Href + var link string + for _, l := range item.Link { + if l.Type == "text/html" { + link = l.Href + } } - } - if !doArticle(filename, link, item.Title.Text, item.Nstag, item.Nslabeltag, t, needUpdate) { - x(os.Remove(filename + ".xml")) - } + ok = doArticle(filename, link, item.Title.Text, item.Nstag, item.Nslabeltag, t, needUpdate) + }() } } @@ -142,15 +154,15 @@ func doArticle(filename string, url string, title string, tags []string, labels time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) - x(err) + p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) - x(err) + p(err) body, err := io.ReadAll(resp.Body) - x(err) - x(resp.Body.Close()) + p(err) + p(resp.Body.Close()) /* s := string(body) @@ -171,80 +183,71 @@ func doArticle(filename string, url string, title string, tags []string, labels _ = w(fmt.Errorf("script jsonld not found: %s", url)) fp, err := os.Create(filename + ".err") - x(err) - _, err = fmt.Fprintf(fp, "script jsonld not found: %s\n", url) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(err) + p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err := os.Create(filename + ".json") - x(err) - _, err = fp.WriteString(s) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".json", timestamp, timestamp)) + p(err) + p(fp.WriteString(s)) + p(fp.Close()) + p(os.Chtimes(filename+".json", timestamp, timestamp)) */ var buf bytes.Buffer doc, err := gokogiri.ParseHtml(body) - x(err) + p(err) root := doc.Root() lnn, err := root.Search(`//head/link[@rel="canonical"]/@href`) - x(err) + p(err) for _, ln := range lnn { if strings.Contains(ln.String(), "/liveblog/") { fp, err := os.Create(filename + ".skip") - x(err) - _, err = fp.WriteString("liveblog\n") - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".skip", timestamp, timestamp)) + p(err) + p(fp.WriteString("liveblog\n")) + p(fp.Close()) + p(os.Chtimes(filename+".skip", timestamp, timestamp)) return true } } if len(tags) == 0 { - _, err = fmt.Fprintln(&buf, "##META text cat =") - x(err) + p(fmt.Fprintln(&buf, "##META text cat =")) } else { for _, tag := range tags { - _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag)) - x(err) + p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag))) } } if len(labels) == 0 { - _, err = fmt.Fprintln(&buf, "##META text label =") - x(err) + p(fmt.Fprintln(&buf, "##META text label =")) } else { for _, label := range labels { - _, err = fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label)) - x(err) + p(fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label))) } } _, err = buf.WriteString(addEnd(fixSpace(title))) - x(err) + p(err) fouten := make([]string, 0) found := false pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`) - x(err) - for _, p := range pp { - _, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content()))) - x(err) + p(err) + for _, p1 := range pp { + p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content())))) found = true } if !found { @@ -257,10 +260,9 @@ func doArticle(filename string, url string, title string, tags []string, labels `//div[@data-sentry-component="ArticleText"]//*[contains(@class,"prose-article-body-r")]` + ` | ` + `//div[@data-sentry-component="ArticleTitle"]//h2`) - x(err) - for _, p := range pp { - _, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content()))) - x(err) + p(err) + for _, p1 := range pp { + p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content())))) found = true } if !found { @@ -270,31 +272,27 @@ func doArticle(filename string, url string, title string, tags []string, labels if len(fouten) > 0 { fp, err := os.Create(filename + ".err") - x(err) + p(err) for _, fout := range fouten { - _, err = fp.WriteString(fout) - x(err) + p(fp.WriteString(fout)) } - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".err", timestamp, timestamp)) + p(fp.Close()) + p(os.Chtimes(filename+".err", timestamp, timestamp)) fp, err = os.Create(filename + ".html") - x(err) - _, err = fp.Write(body) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".html", timestamp, timestamp)) + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) return false } fp, err := os.Create(filename + ".txt") - x(err) - _, err = fp.Write(buf.Bytes()) - x(err) - x(fp.Close()) - x(os.Chtimes(filename+".txt", timestamp, timestamp)) + p(err) + p(fp.Write(buf.Bytes())) + p(fp.Close()) + p(os.Chtimes(filename+".txt", timestamp, timestamp)) return true } diff --git a/VRT/txt2corpus.sh b/VRT/txt2corpus.sh index 37a2969..02b1d3e 100755 --- a/VRT/txt2corpus.sh +++ b/VRT/txt2corpus.sh @@ -51,8 +51,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata 2> err -rm err +../../../metadata cd xml alto -o $corpus.data.dz *.xml 2> /dev/null diff --git a/cmd/ISOWeek/ISOWeek.go b/cmd/ISOWeek/ISOWeek.go index fe518bb..9bbec96 100644 --- a/cmd/ISOWeek/ISOWeek.go +++ b/cmd/ISOWeek/ISOWeek.go @@ -23,7 +23,7 @@ Dit programma geeft wel de juiste uitvoer. */ import ( - "github.com/pebbe/util" + e "codeberg.org/pebbe/errors" "fmt" "os" @@ -32,7 +32,7 @@ import ( ) var ( - x = util.CheckErr + x = e.ExitErr ) func main() { diff --git a/go.mod b/go.mod index 2db2528..96db506 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,7 @@ module nlnieuws -go 1.25.0 +go 1.26.1 -require ( - github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 - github.com/pebbe/util v0.10.0 -) +require github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 + +require codeberg.org/pebbe/errors v0.3.0 diff --git a/go.sum b/go.sum index c3fbfa7..977948f 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,4 @@ +codeberg.org/pebbe/errors v0.3.0 h1:031dKFUvGzXxsb+ig7cKNpohHeQ38ghXDZrd7KehdbU= +codeberg.org/pebbe/errors v0.3.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4= github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw= -github.com/pebbe/util v0.10.0 h1:6GAxH8fo6HGOUq+JcPVsMuyvLS2mPwkrAx3/DCoYy1Y= -github.com/pebbe/util v0.10.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q= diff --git a/namen.sh b/namen.sh index 36f989b..4e62b80 100755 --- a/namen.sh +++ b/namen.sh @@ -15,6 +15,7 @@ overige opties: -n int : max aantal resultaten -p : gebruik pager -s : tel hits één keer per bericht + -v : verbose " exit } @@ -23,7 +24,8 @@ SINGLE=0 SELECT=0 USEPAGER=0 LIMIT=0 -while getopts 'sin:p' opt +VERBOSE=0 +while getopts 'sin:pv' opt do case "$opt" in i) @@ -38,6 +40,9 @@ do s) SINGLE=1 ;; + v) + VERBOSE=1 + ;; *) usage ;; @@ -66,7 +71,11 @@ else fi search () { - eval "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null $TAIL" + if [ $VERBOSE = 1 ] + then + echo "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null$TAIL" + fi + eval "alto $@ '$EXPR' '$TEMPLATE' | $SORT | uniq -c | sort -nr 2> /dev/null$TAIL" } for i in "$@"