Top20
This commit is contained in:
62
cmd/top20/top20.go
Normal file
62
cmd/top20/top20.go
Normal file
@@ -0,0 +1,62 @@
|
||||
package main
|
||||
|
||||
import (
	"bufio"
	"fmt"
	"os"
	"regexp"
	"strings"

	e "codeberg.org/pebbe/errors"
)
|
||||
|
||||
var (
|
||||
x = e.ExitErr
|
||||
reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
|
||||
seen = make(map[string]bool)
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
m := reFile.FindStringSubmatch(os.Args[1])
|
||||
infile := m[0]
|
||||
prefix := m[1]
|
||||
suffix := m[3] + ".t20"
|
||||
target := infile + ".t20"
|
||||
|
||||
x(os.Chdir("/net/corpora/nlnieuws/data"))
|
||||
|
||||
files, err := os.ReadDir(".")
|
||||
x(err)
|
||||
for _, file := range files {
|
||||
name := file.Name()
|
||||
if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
|
||||
fp, err := os.Open(name)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fp)
|
||||
for scanner.Scan() {
|
||||
seen[strings.SplitN(scanner.Text(), "\t", 2)[1]] = true
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fp.Close())
|
||||
}
|
||||
}
|
||||
|
||||
fpin, err := os.Open(infile)
|
||||
x(err)
|
||||
fpout, err := os.Create(target)
|
||||
x(err)
|
||||
scanner := bufio.NewScanner(fpin)
|
||||
n := 0
|
||||
for scanner.Scan() && n < 20 {
|
||||
line := scanner.Text()
|
||||
w := strings.SplitN(line, "\t", 2)[1]
|
||||
if seen[w] {
|
||||
continue
|
||||
}
|
||||
n++
|
||||
x(fpout.WriteString(line + "\n"))
|
||||
}
|
||||
x(scanner.Err())
|
||||
x(fpout.Close())
|
||||
x(fpin.Close())
|
||||
}
|
||||
Reference in New Issue
Block a user