Crawling a site and downloading files

This page describes one of my first Go projects. Go has some great third-party packages for web scraping, but I wanted to write a web scraper using components from the standard library. I ultimately found it necessary to use Go's official HTML package (golang.org/x/net/html) to tokenize and parse HTML pages. As time went by, I also found it helpful to use the third-party package termenv to style terminal output and make errors easier to spot in the verbose output.





  • First, I've defined helper functions.
  • The site I am crawling has download pages for six different languages; I've written a helper function that maps integers to string abbreviations of these languages, so as to create directories on the local filesystem for each language.
  • Some download pages may contain links to the same downloads, so I've created a function that can remove these duplicates.
  • The site I am crawling has downloads organized by the authors of the files. For this reason my program first finds all links that point to these author pages, and will then scrape these pages to find specific downloads.
package main

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "strings"

    "golang.org/x/net/html"
)

// get_lang converts a language index (0-5) into the abbreviation used for
// that language's directory name. Any index outside that range yields the
// string "invalid language".
func get_lang(num int) string {
    // Position in this table is the language index.
    abbreviations := [...]string{"en", "es", "fr", "po", "it", "de"}

    if num < 0 || num >= len(abbreviations) {
        return "invalid language"
    }
    return abbreviations[num]
}

// removeDuplicates returns the distinct strings of input in the order each
// one first appears. The result is always a non-nil slice, even when input
// is empty.
func removeDuplicates(input []string) []string {
    seen := make(map[string]struct{})
    unique := make([]string, 0)

    for idx := range input {
        item := input[idx]
        if _, dup := seen[item]; dup {
            continue
        }
        seen[item] = struct{}{}
        unique = append(unique, item)
    }

    return unique
}

// scrape_authors fetches the page at url and returns the href value of every
// <a> tag whose link ends in "/". The caller treats these links as author
// pages.
//
// An error is returned when the request fails, when the server answers with
// a non-2xx status (so an error page is never scraped as if it were the
// listing), or when the HTML tokenizer fails before reaching end of input.
// On error the returned slice is nil.
func scrape_authors(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, fmt.Errorf("unable to fetch URL %s: %w", url, err)
    }
    defer resp.Body.Close()

    // http.Get reports only transport failures; HTTP-level failures such as
    // 404 or 500 must be detected from the status code.
    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return nil, fmt.Errorf("unable to fetch URL %s: unexpected status %s", url, resp.Status)
    }

    z := html.NewTokenizer(resp.Body)

    var links []string

    for {
        switch z.Next() {
        case html.ErrorToken:
            // io.EOF marks the normal end of the document.
            if err := z.Err(); err != io.EOF {
                return nil, fmt.Errorf("unable to parse %s: %w", url, err)
            }
            return links, nil
        case html.StartTagToken, html.SelfClosingTagToken:
            // Self-closing anchors are accepted too, matching scrape_zips.
            t := z.Token()
            if t.Data != "a" {
                continue
            }
            for _, a := range t.Attr {
                if a.Key == "href" && strings.HasSuffix(a.Val, "/") {
                    links = append(links, a.Val)
                }
            }
        }
    }
}

// scrape_zips fetches the page at url and returns the href value of every
// <a> tag (regular or self-closing) whose link ends in ".zip" or "-zip".
//
// An error is returned when the request fails, when the server answers with
// a non-2xx status (so an error page is never scraped as if it were the
// download page), or when the HTML tokenizer fails before reaching end of
// input. On error the returned slice is nil.
func scrape_zips(url string) ([]string, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, fmt.Errorf("unable to fetch URL %s: %w", url, err)
    }
    defer resp.Body.Close()

    // http.Get reports only transport failures; HTTP-level failures such as
    // 404 or 500 must be detected from the status code.
    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return nil, fmt.Errorf("unable to fetch URL %s: unexpected status %s", url, resp.Status)
    }

    z := html.NewTokenizer(resp.Body)
    var links []string

    for {
        switch z.Next() {
        case html.ErrorToken:
            // io.EOF marks the normal end of the document.
            if err := z.Err(); err != io.EOF {
                return nil, fmt.Errorf("unable to parse %s: %w", url, err)
            }
            return links, nil
        case html.StartTagToken, html.SelfClosingTagToken:
            t := z.Token()
            if t.Data != "a" {
                continue
            }
            for _, a := range t.Attr {
                if a.Key != "href" {
                    continue
                }
                if strings.HasSuffix(a.Val, ".zip") || strings.HasSuffix(a.Val, "-zip") {
                    links = append(links, a.Val)
                }
            }
        }
    }
}

// download streams the body of the HTTP response for url into dest.
//
// download takes ownership of dest: the file is always closed before the
// function returns. If anything fails (the request, a non-2xx status, the
// copy, or the final close), the file is also removed so that a partial or
// bogus download is not later mistaken for a completed one.
func download(url string, dest *os.File) (err error) {
    // err is a named result so this deferred cleanup can see the outcome.
    defer func() {
        if cerr := dest.Close(); cerr != nil && err == nil {
            // A failed close can mean buffered data never reached disk.
            err = fmt.Errorf("closing %s: %w", dest.Name(), cerr)
        }
        if err != nil {
            // Best effort: the original failure is what the caller needs.
            _ = os.Remove(dest.Name())
        }
    }()

    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // http.Get does not treat 404/500 responses as errors; without this
    // check the server's error page would be saved as the download.
    if resp.StatusCode < 200 || resp.StatusCode > 299 {
        return fmt.Errorf("downloading %s: unexpected status %s", url, resp.Status)
    }

    if _, err = io.Copy(dest, resp.Body); err != nil {
        return fmt.Errorf("downloading %s: %w", url, err)
    }

    return nil
}
                        

Main program logic



  • The main function is the entry point to any Go program. Here, main first calls the function confirmWorkingDir that will determine the local directory from which this program was invoked. This is necessary later on to create subdirectories for each language and write downloaded files to this directory. This way a user can move the executable file to the place they would like these downloads to exist on their filesystem, or external storage.
  • The confirmWorkingDir function also prompts the user to ensure that they have enough space at this location to download all of the files. In the case of my target website, this is roughly 90GB.
  • I've used WaitGroups and goroutines in the run function to crawl the download page for each language concurrently, each in its own goroutine. This significantly decreases runtime, as author links for each language can be scraped in parallel.
  • Once the author pages for each language have been found, a new logical block will iterate over these author pages and scrape the links that directly point to .zip files. This will again occur in a separate goroutine for each language.
  • When this process completes, duplicates are removed and the helper functions shown above are used to download the files.
  • Extensive error handling and terminal output have been added to inform the CLI application user of the progress of the program and any errors that occur.
package main

import (
    "bufio"
    "errors"
    "fmt"
    "log"
    "os"
    "path/filepath"
    "strings"
    "sync"

    "github.com/muesli/termenv"
)

// URL is the base address of the site being crawled; run appends each
// language's download path to it.
const URL = "https://target-site.org/"

// main prints a welcome banner, asks the user to confirm the download
// location, runs the crawl, and then waits for input on stdin before exiting.
func main() {
    term := termenv.NewOutput(os.Stdout)
    banner := term.String("Welcome to the Archive Utility\n").Bold().Underline()
    fmt.Println(banner)

    confirmWorkingDir()
    run()

    // Wait for input so the summary can be read before the process ends.
    fmt.Println("Press any key to exit...")
    stdin := bufio.NewReader(os.Stdin)
    stdin.ReadRune()
}

// confirmWorkingDir changes the process's working directory to the folder
// that contains the executable, tells the user where files will be written,
// and asks them to confirm that enough free space is available there.
//
// The function returns only when the user answers "Y" or "y". Any other
// answer, or any failure along the way, ends the program through log.Fatal.
func confirmWorkingDir() {
    exPath, err := os.Executable()
    if err != nil {
        log.Fatalf("Unable to detect working directory: %v", err)
    }
    // Fatalf is used throughout: log.Fatal(msg, err) would print the message
    // and the error with no space between them.
    if err := os.Chdir(filepath.Dir(exPath)); err != nil {
        log.Fatalf("Unable to change working directory: %v", err)
    }
    wd, err := os.Getwd()
    if err != nil {
        log.Fatalf("Unable to detect working directory: %v", err)
    }
    fmt.Printf("Files will be downloaded to:  %s\n", wd)
    fmt.Println("Please confirm that you have 100GB free at this location by entering 'Y' to continue or any other key to quit:")

    input, err := bufio.NewReader(os.Stdin).ReadString('\n')
    if err != nil {
        log.Fatalf("Error reading input: %v", err)
    }

    if !strings.EqualFold(strings.TrimSpace(input), "Y") {
        log.Fatal("User exited program")
    }
}

// run crawls the download pages of all six languages and downloads every
// .zip file that is not already present on disk.
//
// Steps:
//  1. Change into the executable's directory and report it to the user.
//  2. Scrape each language's download page for author links, one goroutine
//     per language.
//  3. For each language (again one goroutine each), scrape its author pages
//     for .zip links, drop duplicates, and download the files that are
//     missing into a directory named after the language.
//  4. Print the new downloads and every error that was encountered.
//
// Every failure that is printed while crawling or downloading is also
// recorded, so it appears in the final error summary as a single entry.
func run() {
    output := termenv.NewOutput(os.Stdout)
    red := output.Color("1")

    var (
        mu             sync.Mutex // guards newDownloads, errorsOccurred and pages
        newDownloads   []string
        errorsOccurred []string
    )

    // printFatal prints a message for failures that stop run before any
    // crawling starts.
    printFatal := func(parts ...string) {
        fmt.Println(output.String(parts...).Bold().Underline().Foreground(red))
    }

    // reportErr prints text in red and records it for the final summary.
    reportErr := func(text string) {
        fmt.Println(output.String(text).Foreground(red))
        mu.Lock()
        errorsOccurred = append(errorsOccurred, text)
        mu.Unlock()
    }

    // get client working directory and output so user knows where to locate downloaded files
    exPath, err := os.Executable()
    if err != nil {
        printFatal("Unable to detect working directory:", err.Error())
        return
    }
    if err := os.Chdir(filepath.Dir(exPath)); err != nil {
        printFatal("Unable to change working directory:", err.Error())
        return
    }
    wd, err := os.Getwd()
    if err != nil {
        printFatal("Unable to detect working directory:", err.Error())
        return
    }
    fmt.Printf("Files will be downloaded to:  %s\n\n", wd)

    fmt.Printf("Searching for available downloads...\n\n")

    // The index of each URL is the language key understood by get_lang().
    urls := [6]string{
        URL + "en/downloads/",
        URL + "es/descargas/",
        URL + "fr/telechargements/",
        URL + "po/downloads/",
        URL + "it/download/",
        URL + "de/herunterladen/",
    }

    // pages maps a language key to the author links found for that language.
    // Every key is pre-populated so each language still gets a goroutine in
    // the download phase even if scraping its author links failed.
    pages := map[int][]string{
        0: {}, // en
        1: {}, // es
        2: {}, // fr
        3: {}, // po
        4: {}, // it
        5: {}, // de
    }

    var langWG sync.WaitGroup
    langWG.Add(len(urls))
    for i, v := range urls {
        go func(i int, v string) {
            defer langWG.Done()
            links, err := scrape_authors(v)
            if err != nil {
                reportErr(fmt.Sprintf("Error scraping authors from %s: %v", v, err))
                return
            }
            mu.Lock()
            pages[i] = links
            mu.Unlock()
        }(i, v)
    }
    langWG.Wait()

    var pagesWG sync.WaitGroup
    pagesWG.Add(len(pages))
    for i, authors := range pages { // for each language, get .zip downloads from author links
        go func(i int, authors []string) {
            defer pagesWG.Done()

            langName := get_lang(i)
            log.Printf("Checking %v %v links for .zip files...\n", len(authors), langName)

            downloadDir := langName // create a local dir for current language
            if err := os.MkdirAll(downloadDir, 0755); err != nil {
                reportErr(fmt.Sprintf("Terminating because a directory could not be created: %v", err))
                return
            }

            var langZips []string
            for _, author := range authors {
                if !strings.Contains(author, "/"+langName+"/") {
                    log.Printf("Skipping link %s. It does not match language %s", author, langName)
                    continue
                }
                log.Println("Found", author)
                zips, err := scrape_zips(author)
                if err != nil {
                    reportErr(fmt.Sprintf("Error scraping zips from %s: %v", author, err))
                    continue
                }
                langZips = append(langZips, zips...)
            }

            unique := removeDuplicates(langZips)
            totalFiles := len(unique)

            for j, talk := range unique {
                filename := filepath.Base(talk)
                pathToFile := filepath.Join(downloadDir, filename)
                progress := fmt.Sprintf("%s %d/%d", langName, j+1, totalFiles)

                _, statErr := os.Stat(pathToFile)
                switch {
                case statErr == nil: // file exists
                    fmt.Printf("%s: File %s has been downloaded previously.\n", progress, talk)
                    continue
                case !errors.Is(statErr, os.ErrNotExist): // could not tell whether the file exists
                    reportErr(fmt.Sprintf("%s: Error downloading %s %v", progress, pathToFile, statErr))
                    continue
                }

                // file does not exist yet: create its directory and the file itself
                dir := filepath.Dir(pathToFile)
                if err := os.MkdirAll(dir, 0755); err != nil {
                    reportErr(fmt.Sprintf("Terminating because the directory %s could not be created: %v", dir, err))
                    return
                }
                file, err := os.Create(pathToFile)
                if err != nil {
                    reportErr(fmt.Sprintf("Terminating because the file %s could not be created: %v", pathToFile, err))
                    return
                }

                if err := download(talk, file); err != nil {
                    reportErr(fmt.Sprintf("%s: Error downloading %s %v", progress, pathToFile, err))
                    // download has closed the file. Remove the partial file so
                    // the next run retries it instead of reporting it as
                    // "downloaded previously". Best effort; it may be gone.
                    _ = os.Remove(pathToFile)
                    continue
                }

                fmt.Println(output.String(fmt.Sprintf("%s: Downloaded: %s", progress, talk)).
                    Foreground(output.Color("34")))
                mu.Lock()
                newDownloads = append(newDownloads, filename)
                mu.Unlock()
            }
        }(i, authors)
    }
    pagesWG.Wait()

    fmt.Println(output.String("All available files have been downloaded. New downloads include:").
        Bold().
        Underline())

    for _, name := range newDownloads {
        fmt.Println(output.String(name))
    }
    if len(newDownloads) == 0 {
        fmt.Println("None")
    }

    if len(errorsOccurred) > 0 {
        fmt.Println(output.String("\nThe following errors ocurred:").
            Bold().
            Underline().
            Foreground(red))

        for _, e := range errorsOccurred {
            fmt.Println(output.String(e).Foreground(red))
        }
    }
}
        

View the project source code on GitHub

Top Of Page