| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
- package main
- import (
- "fmt"
- "io/ioutil"
- "path/filepath"
- "sort"
- "strings"
- )
// Pair is a single (word, count) record passed between pipeline stages.
// A Pair whose key is the empty string is used by the workers in this file
// as an end-of-stream marker.
type Pair struct {
	key   string // the word
	value int    // number of occurrences counted for key
}
// AggPair groups every partial count collected for one word so that a
// reduce worker can sum them.
type AggPair struct {
	key  string // the word
	list []int  // partial counts gathered for key
}
- type PairSlice []Pair
- func (s PairSlice) Len() int {
- return len(s)
- }
- func (s PairSlice) Swap(i, j int) {
- s[i], s[j] = s[j], s[i]
- }
- func (s PairSlice) Less(i, j int) bool {
- if s[i].value == s[j].value {
- return s[i].key < s[j].key
- } else {
- return s[i].value > s[j].value
- }
- }
- func MapWorker(filename chan string, shuffle chan Pair) {
- dict := make(map[string]int)
- for name := range filename {
- data, _ := ioutil.ReadFile(name)
- word := ""
- for i := range data {
- if data[i] >= 'A' && data[i] <= 'Z' {
- data[i] += 'a' - 'A'
- }
- if data[i] >= 'a' && data[i] <= 'z' || word != "" && data[i] == '\'' {
- word = strings.Join([]string{word, string(data[i])}, "")
- } else if word != "" {
- dict[word]++
- word = ""
- }
- }
- if word != "" {
- dict[word]++
- }
- }
- for word, cnt := range dict {
- shuffle <- Pair{word, cnt}
- }
- shuffle <- Pair{"", 0}
- }
- func ReduceWorker(wordlist chan AggPair, output chan Pair) {
- for aggPair := range wordlist {
- sum := 0
- for _, value := range aggPair.list {
- sum += value
- }
- output <- Pair{aggPair.key, sum}
- }
- output <- Pair{"", 0}
- }
- func main() {
- const path = "../testdata"
- workerCnt := 0
- dataDir, err := ioutil.ReadDir(path)
- if err != nil {
- fmt.Println("Directory not found!")
- return
- }
- filename := make(chan string, 1000)
- go func() {
- for _, file := range dataDir {
- filename <- filepath.Join(path, file.Name())
- }
- close(filename)
- }()
- shuffle := make(chan Pair, 1000)
- for i := 0; i < 20; i++ {
- workerCnt++
- go MapWorker(filename, shuffle)
- }
- dict := make(map[string][]int)
- for pair := range shuffle {
- if pair.key == "" {
- workerCnt--
- if workerCnt == 0 {
- break
- }
- } else {
- dict[pair.key] = append(dict[pair.key], pair.value)
- }
- }
- wordlist := make(chan AggPair, 1000)
- go func() {
- for word, list := range dict {
- wordlist <- AggPair{word, list}
- }
- close(wordlist)
- }()
- output := make(chan Pair, 1000)
- for i := 0; i < 1000; i++ {
- workerCnt++
- go ReduceWorker(wordlist, output)
- }
- answer := make(PairSlice, 0)
- for pair := range output {
- if pair.key == "" {
- workerCnt--
- if workerCnt == 0 {
- break
- }
- } else {
- answer = append(answer, pair)
- }
- }
- sort.Sort(answer)
- for _, pair := range answer {
- fmt.Println(pair.key, pair.value)
- }
- }
|