speciale spaties -> gewone spaties
This commit is contained in:
@@ -55,7 +55,7 @@ func main() {
|
|||||||
x(err)
|
x(err)
|
||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item), filename)
|
x(xml.Unmarshal(b, &item), filename)
|
||||||
_, err = fp.WriteString(addEnd(item.Title))
|
_, err = fp.WriteString(addEnd(fixSpace(item.Title)))
|
||||||
x(err)
|
x(err)
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
@@ -63,7 +63,7 @@ func main() {
|
|||||||
pp, err := root.Search(`//body/p | //body/h2`)
|
pp, err := root.Search(`//body/p | //body/h2`)
|
||||||
x(err)
|
x(err)
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = fp.WriteString(addEnd(p.Content()))
|
_, err = fp.WriteString(addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
x(err)
|
x(err)
|
||||||
@@ -90,3 +90,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -162,11 +162,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
fp, err := os.Create(filename + ".txt")
|
fp, err := os.Create(filename + ".txt")
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
_, err = fp.WriteString(addEnd(title))
|
_, err = fp.WriteString(addEnd(fixSpace(title)))
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
for _, el := range ell {
|
for _, el := range ell {
|
||||||
_, err = fp.WriteString(addEnd(el.Content()))
|
_, err = fp.WriteString(addEnd(fixSpace(el.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -196,3 +196,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -57,16 +57,16 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(json.Unmarshal(b, &item))
|
x(json.Unmarshal(b, &item))
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
for _, cat := range item.Tags {
|
for _, tag := range item.Tags {
|
||||||
_, err = fmt.Fprintf(fp, "##META text tag = %s\n", cat)
|
_, err = fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
_, err = fp.WriteString(addEnd(item.Title))
|
_, err = fp.WriteString(addEnd(fixSpace(item.Title)))
|
||||||
x(err)
|
x(err)
|
||||||
_, err = fp.WriteString(item.Text)
|
_, err = fp.WriteString(fixSpace(item.Text))
|
||||||
x(err)
|
x(err)
|
||||||
x(fp.Close())
|
x(fp.Close())
|
||||||
}
|
}
|
||||||
@@ -91,3 +91,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -206,6 +206,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
// text bevat kopjes zonder punt aan het eind
|
// text bevat kopjes zonder punt aan het eind
|
||||||
lines := strings.Split(text, "\n")
|
lines := strings.Split(text, "\n")
|
||||||
for i, line := range lines {
|
for i, line := range lines {
|
||||||
|
line = fixSpace(line)
|
||||||
n := len(line)
|
n := len(line)
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
if strings.ContainsAny(line[n-1:], ".!?") {
|
if strings.ContainsAny(line[n-1:], ".!?") {
|
||||||
@@ -229,7 +230,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
x(err)
|
x(err)
|
||||||
} else {
|
} else {
|
||||||
for _, cat := range cats {
|
for _, cat := range cats {
|
||||||
_, err := fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
_, err := fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -241,3 +242,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
|
|||||||
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -149,12 +149,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
} else {
|
} else {
|
||||||
for _, a := range aa {
|
for _, a := range aa {
|
||||||
cat = strings.ReplaceAll(a.Content(), "\n", " ")
|
cat = strings.ReplaceAll(a.Content(), "\n", " ")
|
||||||
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", cat)
|
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = buf.WriteString(addEnd(title))
|
_, err = buf.WriteString(addEnd(fixSpace(title)))
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
// oud: //div[@id="article-blocks"]//p
|
// oud: //div[@id="article-blocks"]//p
|
||||||
@@ -184,7 +184,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
return false // echt fout
|
return false // echt fout
|
||||||
}
|
}
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = buf.WriteString(addEnd(p.Content()))
|
_, err = buf.WriteString(addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -218,3 +218,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -57,10 +57,10 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item))
|
x(xml.Unmarshal(b, &item))
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
_, err = fp.WriteString(addEnd(item.Title))
|
_, err = fp.WriteString(addEnd(fixSpace(item.Title)))
|
||||||
x(err)
|
x(err)
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
@@ -73,7 +73,7 @@ func main() {
|
|||||||
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
|
pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
|
||||||
x(err)
|
x(err)
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = fp.WriteString(addEnd(p.Content()))
|
_, err = fp.WriteString(addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
x(err)
|
x(err)
|
||||||
@@ -100,3 +100,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -57,10 +57,10 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item))
|
x(xml.Unmarshal(b, &item))
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
_, err = fp.WriteString(addEnd(item.Title))
|
_, err = fp.WriteString(addEnd(fixSpace(item.Title)))
|
||||||
x(err)
|
x(err)
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
@@ -68,7 +68,7 @@ func main() {
|
|||||||
pp, err := root.Search(`//body//p`)
|
pp, err := root.Search(`//body//p`)
|
||||||
x(err)
|
x(err)
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = fp.WriteString(addEnd(p.Content()))
|
_, err = fp.WriteString(addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
x(err)
|
x(err)
|
||||||
@@ -95,3 +95,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -198,11 +198,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
|||||||
fp, err = os.Create(filename + ".txt")
|
fp, err = os.Create(filename + ".txt")
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
_, err = fp.WriteString(addEnd(title))
|
_, err = fp.WriteString(addEnd(fixSpace(title)))
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = fp.WriteString(addEnd(p.Content()))
|
_, err = fp.WriteString(addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -229,3 +229,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -57,10 +57,10 @@ func main() {
|
|||||||
var item Item
|
var item Item
|
||||||
x(xml.Unmarshal(b, &item))
|
x(xml.Unmarshal(b, &item))
|
||||||
for _, cat := range item.Cats {
|
for _, cat := range item.Cats {
|
||||||
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", cat)
|
_, err = fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
_, err = fp.WriteString(addEnd(item.Title))
|
_, err = fp.WriteString(addEnd(fixSpace(item.Title)))
|
||||||
x(err)
|
x(err)
|
||||||
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
|
||||||
x(err)
|
x(err)
|
||||||
@@ -70,7 +70,7 @@ func main() {
|
|||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
s := p.Content()
|
s := p.Content()
|
||||||
if !strings.Contains(s, "verscheen eerst op Tzum.") {
|
if !strings.Contains(s, "verscheen eerst op Tzum.") {
|
||||||
_, err = fp.WriteString(addEnd(p.Content()))
|
_, err = fp.WriteString(addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -98,3 +98,7 @@ func addEnd(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
@@ -220,7 +220,7 @@ func doArticle(filename string, url string, title string, tags []string, labels
|
|||||||
x(err)
|
x(err)
|
||||||
} else {
|
} else {
|
||||||
for _, tag := range tags {
|
for _, tag := range tags {
|
||||||
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", tag)
|
_, err = fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -229,12 +229,12 @@ func doArticle(filename string, url string, title string, tags []string, labels
|
|||||||
x(err)
|
x(err)
|
||||||
} else {
|
} else {
|
||||||
for _, label := range labels {
|
for _, label := range labels {
|
||||||
_, err = fmt.Fprintf(&buf, "##META text label = %s\n", label)
|
_, err = fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label))
|
||||||
x(err)
|
x(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = buf.WriteString(clean(title))
|
_, err = buf.WriteString(addEnd(fixSpace(title)))
|
||||||
x(err)
|
x(err)
|
||||||
|
|
||||||
fouten := make([]string, 0)
|
fouten := make([]string, 0)
|
||||||
@@ -243,7 +243,7 @@ func doArticle(filename string, url string, title string, tags []string, labels
|
|||||||
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`)
|
pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//p[contains(@class,"prose-article-body-r")]`)
|
||||||
x(err)
|
x(err)
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = fmt.Fprint(&buf, clean(p.Content()))
|
_, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
found = true
|
found = true
|
||||||
}
|
}
|
||||||
@@ -259,7 +259,7 @@ func doArticle(filename string, url string, title string, tags []string, labels
|
|||||||
`//div[@data-sentry-component="ArticleTitle"]//h2`)
|
`//div[@data-sentry-component="ArticleTitle"]//h2`)
|
||||||
x(err)
|
x(err)
|
||||||
for _, p := range pp {
|
for _, p := range pp {
|
||||||
_, err = fmt.Fprint(&buf, clean(p.Content()))
|
_, err = fmt.Fprint(&buf, addEnd(fixSpace(p.Content())))
|
||||||
x(err)
|
x(err)
|
||||||
found = true
|
found = true
|
||||||
}
|
}
|
||||||
@@ -299,8 +299,8 @@ func doArticle(filename string, url string, title string, tags []string, labels
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func clean(s string) string {
|
func addEnd(s string) string {
|
||||||
s = strings.Join(strings.Fields(s), " ")
|
s = strings.TrimSpace(s)
|
||||||
n := len(s)
|
n := len(s)
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
return ""
|
return ""
|
||||||
@@ -318,3 +318,7 @@ func clean(s string) string {
|
|||||||
}
|
}
|
||||||
return s + ".\n"
|
return s + ".\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fixSpace(s string) string {
|
||||||
|
return strings.Join(strings.Fields(s), " ")
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user