diff --git a/.gitignore b/.gitignore
index 4d804ef..b16dbbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ Tzum/xml2txt
 VRT/metadata
 VRT/vrt
 bin/ISOWeek
+bin/score
 bin/week2files
 20??
 corpus
diff --git a/Makefile b/Makefile
index c0aada1..fa34931 100644
--- a/Makefile
+++ b/Makefile
@@ -11,11 +11,15 @@ all:
 	make -C Tzum
 	make -C VRT
 	make bin/ISOWeek
+	make bin/score
 	make bin/week2files
 
 bin/ISOWeek: cmd/ISOWeek/*.go
 	go build -o $@ $^
 
+bin/score: cmd/score/*.go
+	go build -o $@ $^
+
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^
 
diff --git a/cmd/score/score.go b/cmd/score/score.go
new file mode 100644
index 0000000..52b8c59
--- /dev/null
+++ b/cmd/score/score.go
@@ -0,0 +1,138 @@
+// Command score compares a frequency list with the list of the week before it
+// and prints every entry of the current list with the change in its count.
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"regexp"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// Item is one entry of the current list, scored against the previous list.
+type Item struct {
+	text  string // everything after the first tab of the input line
+	score int    // current count minus the count in the previous list
+	isnew bool   // true when text does not occur in the previous list
+}
+
+// reYearWeek captures the last YYYY-WW stamp in a file name together with the
+// text before and after it.
+var (
+	x          = e.ExitErr
+	reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
+	count      = make(map[string]int)
+	items      = make([]Item, 0)
+)
+
+// main reads the list named by the first argument and the list of the week
+// before it (see getPrev), then prints one line per entry of the current list:
+// "N" for an entry missing from the previous list and "." otherwise, the change
+// in count, and the text, ordered by descending change and then by text.
+func main() {
+	if len(os.Args) < 2 {
+		x(fmt.Errorf("usage: %s FILE", os.Args[0]))
+	}
+	filename := os.Args[1]
+	prevname := getPrev(filename)
+
+	fp, err := os.Open(prevname)
+	x(err)
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		n, text := parseLine(scanner.Text())
+		count[text] = n
+	}
+	x(scanner.Err())
+	x(fp.Close())
+
+	fp, err = os.Open(filename)
+	x(err)
+	scanner = bufio.NewScanner(fp)
+	for scanner.Scan() {
+		n, text := parseLine(scanner.Text())
+		n1, ok := count[text]
+		items = append(items, Item{
+			text:  text,
+			score: n - n1,
+			isnew: !ok,
+		})
+	}
+	x(scanner.Err())
+	x(fp.Close())
+
+	sort.Slice(items, func(i, j int) bool {
+		/*
+			if items[i].isnew && !items[j].isnew {
+				return true
+			}
+			if !items[i].isnew && items[j].isnew {
+				return false
+			}
+		*/
+		if items[i].score != items[j].score {
+			return items[i].score > items[j].score
+		}
+		return items[i].text < items[j].text
+	})
+
+	for _, item := range items {
+		/*
+			if item.score < 2 {
+				break
+			}
+		*/
+		p := "."
+		if item.isnew {
+			p = "N"
+		}
+		fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
+	}
+}
+
+// parseLine splits a "count<TAB>text" line into its count and its text. A line
+// without a tab or with a non-numeric count is reported through x.
+func parseLine(line string) (int, string) {
+	num, text, ok := strings.Cut(line, "\t")
+	if !ok {
+		x(fmt.Errorf("no tab in line %q", line))
+	}
+	n, err := strconv.Atoi(strings.TrimSpace(num))
+	x(err)
+	return n, text
+}
+
+// getPrev returns filename with its last YYYY-WW stamp replaced by that of the
+// preceding week. Week 01 is preceded by week 53 of the year before when a file
+// of that name exists, and by week 52 otherwise.
+func getPrev(filename string) string {
+	mm := reYearWeek.FindStringSubmatch(filename)
+	if mm == nil {
+		x(fmt.Errorf("no YYYY-WW stamp in file name %q", filename))
+	}
+	year, err := strconv.Atoi(mm[2][:4])
+	x(err)
+	week, err := strconv.Atoi(mm[2][5:])
+	x(err)
+
+	week--
+	if week == 0 {
+		week = 53
+		year--
+	}
+	newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
+	if week == 53 {
+		// Not every year has a week 53, so only use it when the file is there.
+		_, err := os.Stat(newname)
+		if err == nil {
+			return newname
+		}
+		newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
+	}
+	return newname
+}
diff --git a/cmd/week2files/week2files.go b/cmd/week2files/week2files.go
index 807e05a..a2884d0 100644
--- a/cmd/week2files/week2files.go
+++ b/cmd/week2files/week2files.go
@@ -25,6 +25,12 @@ func main() {
 	week, err := strconv.Atoi(aa[1])
 	x(err)
 
+	weken, err := strconv.Atoi(os.Args[2])
+	x(err)
+	if weken < 1 {
+		x(fmt.Errorf("ongeldig aantal weken: %d", weken))
+	}
+
 	if year < 1000 || year > 9999 {
 		x(fmt.Errorf("ongeldig year: %d", year))
 	}
@@ -74,9 +80,9 @@ func main() {
 	}
 	t = t.AddDate(0, 0, 1-d)
 
-	// drie voorgaande weken
-	t2 := t.AddDate(0, 0, -21)
-	for i := range 4 {
+	// the preceding weeks plus the current week
+	t2 := t.AddDate(0, 0, -7*(weken-1))
+	for i := range weken {
 		if i > 0 {
 			fmt.Print(" -or")
 		}
@@ -85,9 +91,9 @@ func main() {
 		t2 = t2.AddDate(0, 0, 7)
 	}
 
-	// vanaf begin drie weken geleden t/m eind huidige week
-	t = t.AddDate(0, 0, -21)
-	for range 28 {
+	// from the start of the preceding weeks through the end of the current week
+	t = t.AddDate(0, 0, -7*(weken-1))
+	for range 7 * weken {
 		fmt.Printf(" -or -name %d-%02d-%02d.data.dz", t.Year(), t.Month(), t.Day())
 		t = t.AddDate(0, 0, 1)
 	}
diff --git a/collect.sh b/collect.sh
index 68937d5..5211bd0 100755
--- a/collect.sh
+++ b/collect.sh
@@ -21,40 +21,62 @@ else
 esac
 fi
 
-files=$(find /net/corpora/nlnieuws/ $(week2files $ds))
-
 cd /net/corpora/nlnieuws/data
 
-alto \
-    'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
-    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-namen-$ds
+# Run every query twice: over the current week only (1) and over four weeks (4).
+for i in 1 4
+do
 
-alto \
-    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
-    tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-woorden-$ds
+    files=$(find .. $(week2files $ds $i))
 
-alto \
-    'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
-his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
-    'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > nieuw-extra-$ds
+    alto \
+        'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \
+        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > nieuw-namen-$ds-$i
 
-alto \
-    'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > locaties-$ds
+    score nieuw-namen-$ds-$i > nieuw-namen-$ds-$i.score
 
-alto \
-    'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > personen-$ds
+    alto \
+        'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
+        his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+        tt:%w $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > nieuw-woorden-$ds-$i
 
-alto \
-    'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his=" normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > organisatie-$ds
+    score nieuw-woorden-$ds-$i > nieuw-woorden-$ds-$i.score
 
-alto \
-    'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
-    tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-$ds
+    alto \
+        'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @
+        his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \
+        'tt:%w\t%l\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > nieuw-extra-$ds-$i
 
-alto \
-    'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
-    'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sort -nr > overigen-extra-$ds
+    score nieuw-extra-$ds-$i > nieuw-extra-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="LOC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > locaties-$ds-$i
+
+    score locaties-$ds-$i > locaties-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="PER" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > personen-$ds-$i
+
+    score personen-$ds-$i > personen-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="ORG" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > organisaties-$ds-$i
+
+    score organisaties-$ds-$i > organisaties-$ds-$i.score
+
+    alto \
+        'fp://node[(@neclass="MISC" and @his="normal" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"] and @his="normal")]' \
+        tt:%l $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > overigen-$ds-$i
+
+    score overigen-$ds-$i > overigen-$ds-$i.score
+
+    alto \
+        'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \
+        'tt:%w\t%P' $files | sort | uniq -c | grep -v '^ *1 ' | sed -e 's/\([0-9]\) /\1\t/' | sort -nr > overigen-extra-$ds-$i
+
+    score overigen-extra-$ds-$i > overigen-extra-$ds-$i.score
+
+done