speciale spaties -> gewone spaties

This commit is contained in:
Peter Kleiweg
2026-03-02 21:41:13 +01:00
parent a94b190108
commit fd750a8d47
10 changed files with 72 additions and 31 deletions

View File

@@ -55,7 +55,7 @@ func main() {
x(err) x(err)
var item Item var item Item
x(xml.Unmarshal(b, &item), filename) x(xml.Unmarshal(b, &item), filename)
_, err = fp.WriteString(addEnd(item.Title)) _, err = fp.WriteString(addEnd(fixSpace(item.Title)))
x(err) x(err)
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(err) x(err)
@@ -63,7 +63,7 @@ func main() {
pp, err := root.Search(`//body/p | //body/h2`) pp, err := root.Search(`//body/p | //body/h2`)
x(err) x(err)
for _, p := range pp { for _, p := range pp {
_, err = fp.WriteString(addEnd(p.Content())) _, err = fp.WriteString(addEnd(fixSpace(p.Content())))
x(err) x(err)
} }
x(err) x(err)
@@ -90,3 +90,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -162,11 +162,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
fp, err := os.Create(filename + ".txt") fp, err := os.Create(filename + ".txt")
x(err) x(err)
_, err = fp.WriteString(addEnd(title)) _, err = fp.WriteString(addEnd(fixSpace(title)))
x(err) x(err)
for _, el := range ell { for _, el := range ell {
_, err = fp.WriteString(addEnd(el.Content())) _, err = fp.WriteString(addEnd(fixSpace(el.Content())))
x(err) x(err)
} }
@@ -196,3 +196,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -57,16 +57,16 @@ func main() {
var item Item var item Item
x(json.Unmarshal(b, &item)) x(json.Unmarshal(b, &item))
for _, cat := range item.Cats { for _, cat := range item.Cats {
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
x(err) x(err)
} }
for _, cat := range item.Tags { for _, tag := range item.Tags {
_, err = fmt.Fprintf(fp, "##META text tag = %s\n", cat) _, err = fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))
x(err) x(err)
} }
_, err = fp.WriteString(addEnd(item.Title)) _, err = fp.WriteString(addEnd(fixSpace(item.Title)))
x(err) x(err)
_, err = fp.WriteString(item.Text) _, err = fp.WriteString(fixSpace(item.Text))
x(err) x(err)
x(fp.Close()) x(fp.Close())
} }
@@ -91,3 +91,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -206,6 +206,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
// text bevat kopjes zonder punt aan het eind // text bevat kopjes zonder punt aan het eind
lines := strings.Split(text, "\n") lines := strings.Split(text, "\n")
for i, line := range lines { for i, line := range lines {
line = fixSpace(line)
n := len(line) n := len(line)
if n > 0 { if n > 0 {
if strings.ContainsAny(line[n-1:], ".!?") { if strings.ContainsAny(line[n-1:], ".!?") {
@@ -229,7 +230,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
x(err) x(err)
} else { } else {
for _, cat := range cats { for _, cat := range cats {
_, err := fmt.Fprintf(fp, "##META text cat = %s\n", cat) _, err := fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
x(err) x(err)
} }
} }
@@ -241,3 +242,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return true return true
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -149,12 +149,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
} else { } else {
for _, a := range aa { for _, a := range aa {
cat = strings.ReplaceAll(a.Content(), "\n", " ") cat = strings.ReplaceAll(a.Content(), "\n", " ")
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", cat) _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))
x(err) x(err)
} }
} }
_, err = buf.WriteString(addEnd(title)) _, err = buf.WriteString(addEnd(fixSpace(title)))
x(err) x(err)
// oud: //div[@id="article-blocks"]//p // oud: //div[@id="article-blocks"]//p
@@ -184,7 +184,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
return false // echt fout return false // echt fout
} }
for _, p := range pp { for _, p := range pp {
_, err = buf.WriteString(addEnd(p.Content())) _, err = buf.WriteString(addEnd(fixSpace(p.Content())))
x(err) x(err)
} }
@@ -218,3 +218,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -57,10 +57,10 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats { for _, cat := range item.Cats {
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
x(err) x(err)
} }
_, err = fp.WriteString(addEnd(item.Title)) _, err = fp.WriteString(addEnd(fixSpace(item.Title)))
x(err) x(err)
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(err) x(err)
@@ -73,7 +73,7 @@ func main() {
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`) pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
x(err) x(err)
for _, p := range pp { for _, p := range pp {
_, err = fp.WriteString(addEnd(p.Content())) _, err = fp.WriteString(addEnd(fixSpace(p.Content())))
x(err) x(err)
} }
x(err) x(err)
@@ -100,3 +100,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -57,10 +57,10 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats { for _, cat := range item.Cats {
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
x(err) x(err)
} }
_, err = fp.WriteString(addEnd(item.Title)) _, err = fp.WriteString(addEnd(fixSpace(item.Title)))
x(err) x(err)
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(err) x(err)
@@ -68,7 +68,7 @@ func main() {
pp, err := root.Search(`//body//p`) pp, err := root.Search(`//body//p`)
x(err) x(err)
for _, p := range pp { for _, p := range pp {
_, err = fp.WriteString(addEnd(p.Content())) _, err = fp.WriteString(addEnd(fixSpace(p.Content())))
x(err) x(err)
} }
x(err) x(err)
@@ -95,3 +95,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -198,11 +198,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
fp, err = os.Create(filename + ".txt") fp, err = os.Create(filename + ".txt")
x(err) x(err)
_, err = fp.WriteString(addEnd(title)) _, err = fp.WriteString(addEnd(fixSpace(title)))
x(err) x(err)
for _, p := range pp { for _, p := range pp {
_, err = fp.WriteString(addEnd(p.Content())) _, err = fp.WriteString(addEnd(fixSpace(p.Content())))
x(err) x(err)
} }
@@ -229,3 +229,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -57,10 +57,10 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats { for _, cat := range item.Cats {
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat) _, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
x(err) x(err)
} }
_, err = fp.WriteString(addEnd(item.Title)) _, err = fp.WriteString(addEnd(fixSpace(item.Title)))
x(err) x(err)
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
x(err) x(err)
@@ -70,7 +70,7 @@ func main() {
for _, p := range pp { for _, p := range pp {
s := p.Content() s := p.Content()
if !strings.Contains(s, "verscheen eerst op Tzum.") { if !strings.Contains(s, "verscheen eerst op Tzum.") {
_, err = fp.WriteString(addEnd(p.Content())) _, err = fp.WriteString(addEnd(fixSpace(p.Content())))
x(err) x(err)
} }
} }
@@ -98,3 +98,7 @@ func addEnd(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}

View File

@@ -220,7 +220,7 @@ func doArticle(filename string, url string, title string, tags []string, labels
x(err) x(err)
} else { } else {
for _, tag := range tags { for _, tag := range tags {
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", tag) _, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag))
x(err) x(err)
} }
} }
@@ -229,12 +229,12 @@ func doArticle(filename string, url string, title string, tags []string, labels
x(err) x(err)
} else { } else {
for _, label := range labels { for _, label := range labels {
_, err = fmt.Fprintf(&buf, "##META text label = %s\n", label) _, err = fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label))
x(err) x(err)
} }
} }
_, err = buf.WriteString(clean(title)) _, err = buf.WriteString(addEnd(fixSpace(title)))
x(err) x(err)
fouten := make([]string, 0) fouten := make([]string, 0)
@@ -243,7 +243,7 @@ func doArticle(filename string, url string, title string, tags []string, labels
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`) pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`)
x(err) x(err)
for _, p := range pp { for _, p := range pp {
_, err = fmt.Fprint(&buf, clean(p.Content())) _, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content())))
x(err) x(err)
found = true found = true
} }
@@ -259,7 +259,7 @@ func doArticle(filename string, url string, title string, tags []string, labels
`//div[@data-sentry-component="ArticleTitle"]//h2`) `//div[@data-sentry-component="ArticleTitle"]//h2`)
x(err) x(err)
for _, p := range pp { for _, p := range pp {
_, err = fmt.Fprint(&buf, clean(p.Content())) _, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content())))
x(err) x(err)
found = true found = true
} }
@@ -299,8 +299,8 @@ func doArticle(filename string, url string, title string, tags []string, labels
return true return true
} }
func clean(s string) string { func addEnd(s string) string {
s = strings.Join(strings.Fields(s), " ") s = strings.TrimSpace(s)
n := len(s) n := len(s)
if n == 0 { if n == 0 {
return "" return ""
@@ -318,3 +318,7 @@ func clean(s string) string {
} }
return s + ".\n" return s + ".\n"
} }
// fixSpace normalizes the white space in s. The string is cut into fields
// on runs of Unicode white space (as strings.Fields defines it, which
// includes no-break and other special spaces) and the fields are glued
// back together with a single ordinary space. Leading and trailing white
// space disappears as a side effect.
func fixSpace(s string) string {
	words := strings.Fields(s)
	return strings.Join(words, " ")
}