diff --git a/AT5/cmd/xml2txt/xml2txt.go b/AT5/cmd/xml2txt/xml2txt.go index 9b4a6fa..2a0d94a 100644 --- a/AT5/cmd/xml2txt/xml2txt.go +++ b/AT5/cmd/xml2txt/xml2txt.go @@ -55,7 +55,7 @@ func main() { x(err) var item Item x(xml.Unmarshal(b, &item), filename) - _, err = fp.WriteString(addEnd(item.Title)) + _, err = fp.WriteString(addEnd(fixSpace(item.Title))) x(err) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) @@ -63,7 +63,7 @@ func main() { pp, err := root.Search(`//body/p | //body/h2`) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(p.Content())) + _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) x(err) } x(err) @@ -90,3 +90,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/GG/cmd/gg/gg.go b/GG/cmd/gg/gg.go index 96869d9..2634084 100644 --- a/GG/cmd/gg/gg.go +++ b/GG/cmd/gg/gg.go @@ -162,11 +162,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n fp, err := os.Create(filename + ".txt") x(err) - _, err = fp.WriteString(addEnd(title)) + _, err = fp.WriteString(addEnd(fixSpace(title))) x(err) for _, el := range ell { - _, err = fp.WriteString(addEnd(el.Content())) + _, err = fp.WriteString(addEnd(fixSpace(el.Content()))) x(err) } @@ -196,3 +196,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/NOS/cmd/json2txt/json2txt.go b/NOS/cmd/json2txt/json2txt.go index cc51e9f..b898ed6 100644 --- a/NOS/cmd/json2txt/json2txt.go +++ b/NOS/cmd/json2txt/json2txt.go @@ -57,16 +57,16 @@ func main() { var item Item x(json.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) x(err) } - for _, cat := range item.Tags { - _, err = fmt.Fprintf(fp, "##META text tag = %s\n", cat) + for _, tag := range item.Tags { + _, err = fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)) x(err) } - _, err = fp.WriteString(addEnd(item.Title)) + _, err = fp.WriteString(addEnd(fixSpace(item.Title))) x(err) - _, err = fp.WriteString(item.Text) + _, err = fp.WriteString(fixSpace(item.Text)) x(err) x(fp.Close()) } @@ -91,3 +91,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/NU/cmd/nu/nu.go b/NU/cmd/nu/nu.go index 2749a07..15752ab 100644 --- a/NU/cmd/nu/nu.go +++ b/NU/cmd/nu/nu.go @@ -206,6 +206,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool // text bevat kopjes zonder punt aan het eind lines := strings.Split(text, "\n") for i, line := range lines { + line = fixSpace(line) n := len(line) if n > 0 { if strings.ContainsAny(line[n-1:], ".!?") { @@ -229,7 +230,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool x(err) } else { for _, cat := range cats { - _, err := fmt.Fprintf(fp, "##META text cat = %s\n", cat) + _, err := fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) x(err) } } @@ -241,3 +242,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool return true } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go index f1cda3d..e7b9166 100644 --- a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go +++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go @@ -149,12 +149,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n } else { for _, a := range aa { cat = strings.ReplaceAll(a.Content(), "\n", " ") - _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", cat) + _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)) x(err) } } - _, err = buf.WriteString(addEnd(title)) + _, err = buf.WriteString(addEnd(fixSpace(title))) x(err) // oud: //div[@id="article-blocks"]//p @@ -184,7 +184,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n return false // echt fout } for _, p := range pp { - _, err = buf.WriteString(addEnd(p.Content())) + _, err = buf.WriteString(addEnd(fixSpace(p.Content()))) x(err) } @@ -218,3 +218,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index 5631268..4e705bd 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -57,10 +57,10 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) x(err) } - _, err = fp.WriteString(addEnd(item.Title)) + _, err = fp.WriteString(addEnd(fixSpace(item.Title))) x(err) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) @@ -73,7 +73,7 @@ func main() { pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(p.Content())) + _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) x(err) } x(err) @@ -100,3 +100,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/Sargasso/cmd/xml2txt/xml2txt.go b/Sargasso/cmd/xml2txt/xml2txt.go index 61fd657..a55ed6b 100644 --- a/Sargasso/cmd/xml2txt/xml2txt.go +++ b/Sargasso/cmd/xml2txt/xml2txt.go @@ -57,10 +57,10 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) x(err) } - _, err = fp.WriteString(addEnd(item.Title)) + _, err = fp.WriteString(addEnd(fixSpace(item.Title))) x(err) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) @@ -68,7 +68,7 @@ func main() { pp, err := root.Search(`//body//p`) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(p.Content())) + _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) x(err) } x(err) @@ -95,3 +95,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/Sikkom/cmd/sikkom/sikkom.go b/Sikkom/cmd/sikkom/sikkom.go index 47e8fbf..1064ce6 100644 --- a/Sikkom/cmd/sikkom/sikkom.go +++ b/Sikkom/cmd/sikkom/sikkom.go @@ -198,11 +198,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n fp, err = os.Create(filename + ".txt") x(err) - _, err = fp.WriteString(addEnd(title)) + _, err = fp.WriteString(addEnd(fixSpace(title))) x(err) for _, p := range pp { - _, err = fp.WriteString(addEnd(p.Content())) + _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) x(err) } @@ -229,3 +229,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 5e3d7e1..54b6384 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -57,10 +57,10 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - _, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) + _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)) x(err) } - _, err = fp.WriteString(addEnd(item.Title)) + _, err = fp.WriteString(addEnd(fixSpace(item.Title))) x(err) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) x(err) @@ -70,7 +70,7 @@ func main() { for _, p := range pp { s := p.Content() if !strings.Contains(s, "verscheen eerst op Tzum.") { - _, err = fp.WriteString(addEnd(p.Content())) + _, err = fp.WriteString(addEnd(fixSpace(p.Content()))) x(err) } } @@ -98,3 +98,7 @@ func addEnd(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +} diff --git a/VRT/cmd/vrt/vrt.go b/VRT/cmd/vrt/vrt.go index f59ce97..c4eef5b 100644 --- a/VRT/cmd/vrt/vrt.go +++ b/VRT/cmd/vrt/vrt.go @@ -220,7 +220,7 @@ func doArticle(filename string, url string, title string, tags []string, labels x(err) } else { for _, tag := range tags { - _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", tag) + _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag)) x(err) } } @@ -229,12 +229,12 @@ func doArticle(filename string, url string, title string, tags []string, labels x(err) } else { for _, label := range labels { - _, err = fmt.Fprintf(&buf, "##META text label = %s\n", label) + _, err = fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label)) x(err) } } - _, err = buf.WriteString(clean(title)) + _, err = buf.WriteString(addEnd(fixSpace(title))) x(err) fouten := make([]string, 0) @@ -243,7 +243,7 @@ func doArticle(filename string, url string, title string, tags []string, labels pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`) x(err) for _, p := range pp { - _, err = fmt.Fprint(&buf, clean(p.Content())) + _, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content()))) x(err) found = true } @@ -259,7 +259,7 @@ func doArticle(filename string, url string, title string, tags []string, labels `//div[@data-sentry-component="ArticleTitle"]//h2`) x(err) for _, p := range pp { - _, err = fmt.Fprint(&buf, clean(p.Content())) + _, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content()))) x(err) found = true } @@ -299,8 +299,8 @@ func doArticle(filename string, url string, title string, tags []string, labels return true } -func clean(s string) string { - s = strings.Join(strings.Fields(s), " ") +func addEnd(s string) string { + s = strings.TrimSpace(s) n := len(s) if n == 0 { return "" @@ -318,3 +318,7 @@ func clean(s string) string { } return s + ".\n" } + +func fixSpace(s string) string { + return strings.Join(strings.Fields(s), " ") +}