沙滩星空的博客

谷歌浏览器爬虫(chromedp)页面请求超时的解决方案

缘起

服务器上部署了谷歌浏览器,以 headless 模式(即无头模式,无 GUI 界面)运行。爬取亚马逊的某个页面时,页面一直未加载完成,请求卡死,一卡就是一整天,爬虫跑不下去了。

解决

我的代码封装了一个 ChromeService 服务,专门用来做浏览器爬虫。
可以使用标准库的 time.AfterFunc 函数解决
在服务中新增一个方法: RunNavigateAndOutHtmlWithTimeout:


// RunNavigateAndOutHtmlWithTimeout navigates the tab bound to c.RunCtx to
// requrl and stores the outer HTML of the page's <html> element in *html.
//
// Navigation gets at most d to finish. If chromedp.Navigate returns in time the
// HTML is read right away; otherwise the HTML of whatever has loaded so far is
// read when the timer fires. The HTML is read exactly once and the method
// blocks until that read has completed.
//
// chromedp errors are logged rather than returned because the method has no
// return value. When the timeout wins, the goroutine running Navigate keeps
// running until chromedp.Run returns on its own.
func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
    var once sync.Once
    done := make(chan struct{})

    // snapshot reads the page HTML at most once, whichever path (timer or
    // finished navigation) reaches it first, and then releases the caller.
    snapshot := func(msg string) {
        once.Do(func() {
            defer close(done)
            fmt.Print(msg)
            if err := chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html)); err != nil {
                log.Printf("RunNavigateAndOutHtmlWithTimeout: read outer html of %q: %v", requrl, err)
            }
        })
    }

    timer := time.AfterFunc(d, func() {
        snapshot(fmt.Sprintf("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n", d))
    })
    // Release the timer when navigation finished first; a no-op if it already fired.
    defer timer.Stop()

    go func() {
        // If navigation does not return before the timeout, the timer takes the snapshot instead.
        if err := chromedp.Run(c.RunCtx, chromedp.Navigate(requrl)); err != nil {
            log.Printf("RunNavigateAndOutHtmlWithTimeout: navigate to %q: %v", requrl, err)
        }
        snapshot(fmt.Sprintf("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n", d))
    }()

    <-done
}

完整代码:

package service

import (
    "bytes"
    "context"
    "errors"
    "fmt"
    "log"
    "math/rand"
    "os"
    "os/exec"
    "runtime"
    "stspider/config"
    "stspider/osfile"
    "stspider/util"
    "sync"
    "time"

    "github.com/chromedp/cdproto/network"
    "github.com/chromedp/chromedp"
)


// ChromeService bundles the settings used to launch or attach to a Chrome
// browser together with the chromedp contexts created for it.
type ChromeService struct {
    // isStartedChromedp is set by StartChromedp once the contexts exist, so
    // that later calls return without creating them again.
    isStartedChromedp bool
    // ChromePath is the Chrome executable. StartBrowser falls back to
    // defaultChromePaths for the current OS when it is empty.
    ChromePath        string
    // ChromeRunArgs are the command-line arguments passed to Chrome.
    ChromeRunArgs     []string
    // DebuggerAddress is the remote debugging endpoint. When it is empty,
    // StartChromedp uses an exec allocator instead of a remote allocator.
    DebuggerAddress   string // e.g. "ws://127.0.0.1:9222"
    // AllocatorCtx and AllocatorCancel come from the chromedp allocator
    // created in StartChromedp.
    AllocatorCtx      context.Context
    AllocatorCancel   context.CancelFunc
    // RunCtx and RunCancel come from chromedp.NewContext(AllocatorCtx); RunCtx
    // is the context passed to chromedp.Run.
    RunCtx            context.Context
    RunCancel         context.CancelFunc
}
// NewChrome returns a ChromeService whose Chrome path, debugger address and
// run arguments are taken from config.NewChromeConfigData. It does not start
// a browser or create any chromedp context.
func NewChrome() *ChromeService {
    cfg := config.NewChromeConfigData()
    svc := &ChromeService{
        ChromePath:      cfg.ChromePath,
        DebuggerAddress: cfg.DebuggerAddress,
    }
    svc.ChromeRunArgs = cfg.GetRunArgs()
    return svc
}
// StartChromedp creates the chromedp allocator and run contexts and stores
// them, with their cancel functions, on the service. Calling it again after a
// successful start is a no-op.
//
// With an empty DebuggerAddress an exec allocator is created from chromedp's
// default options plus headless=false; otherwise a remote allocator is created
// for DebuggerAddress. The stored cancel functions are how the contexts are
// released.
func (c *ChromeService) StartChromedp() {
    if c.isStartedChromedp {
        return
    }

    parent := context.Background()
    switch c.DebuggerAddress {
    case "":
        execOpts := append(chromedp.DefaultExecAllocatorOptions[:], chromedp.Flag("headless", false))
        c.AllocatorCtx, c.AllocatorCancel = chromedp.NewExecAllocator(parent, execOpts...)
    default:
        c.AllocatorCtx, c.AllocatorCancel = chromedp.NewRemoteAllocator(parent, c.DebuggerAddress)
    }

    c.RunCtx, c.RunCancel = chromedp.NewContext(c.AllocatorCtx)
    c.isStartedChromedp = true
}


// defaultChromePaths maps a runtime.GOOS value to the Chrome executable path
// that StartBrowser uses when ChromePath is empty.
// NOTE(review): the darwin entry is not the usual macOS install location
// ("/Applications/Google Chrome.app/Contents/MacOS/Google Chrome") — confirm
// that it is intentional.
var defaultChromePaths = map[string]string{
    "windows": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
    "linux":   "/usr/bin/google-chrome-stable",
    "darwin":  "/usr/bin/chrome",
}
// startBrowserCmds maps a runtime.GOOS value to a factory that builds the
// command StartBrowser runs to launch Chrome on that platform.
var startBrowserCmds = map[string]func(c *ChromeService) *exec.Cmd{
    // Windows runs the generated .bat launcher rather than the binary itself.
    "windows": func(svc *ChromeService) *exec.Cmd {
        launcher := svc.getFileToRunChrome(FILE_EXT_BAT)
        return exec.Command(launcher)
    },
    // Linux and macOS execute the Chrome binary directly; the alternative
    // would be a generated launcher: c.getFileToRunChrome(FILE_EXT_SH).
    "linux":  execChromeBinary,
    "darwin": execChromeBinary,
}

// execChromeBinary builds a command that runs c.ChromePath with c.ChromeRunArgs.
func execChromeBinary(c *ChromeService) *exec.Cmd {
    return exec.Command(c.ChromePath, c.ChromeRunArgs...)
}

// StartBrowser starts Chrome in remote-debugging mode as a separate process,
// unless a process already owns the configured debugger port.
// Like: "C:/Program Files/Google/Chrome/Application/chrome.exe" --remote-debugging-port=9222
//
// When ChromePath is empty it is first set to the default for the current OS.
// An error is returned when the Chrome executable cannot be found, when no
// launch command is defined for the current OS, or when the process fails to
// start. After a successful start the method sleeps for one second before
// returning.
//
// TODO: the user-data-dir directory has no write permission.
func (c *ChromeService) StartBrowser() error {
    if c.ChromePath == "" {
        c.ChromePath = defaultChromePaths[runtime.GOOS]
    }
    fmt.Println(c.ChromePath)
    fmt.Println(c.ChromeRunArgs)
    if !osfile.IsPathExists(c.ChromePath) {
        errMsg := "can not find chrome in path: " + c.ChromePath
        util.GetLogger().Debug(errMsg)
        return errors.New(errMsg)
    }

    port := config.NewChromeConfigData().GetDebuggerPort()
    log.Println("---GetDebuggerPort---", port)
    chromePid := util.GetPidByPort(port)
    log.Println("-----chromePid---", chromePid)

    // A process already owns the debugger port: reuse that browser.
    if chromePid > 0 {
        log.Println("--Skip----Browser--Started---")
        return nil
    }

    // The port is free, so start the browser.
    log.Println("-----StartBrowser---")
    // Example of a direct invocation:
    // cmd := exec.Command(`C:\Program Files\Google\Chrome\Application\chrome.exe`, `--user-data-dir="D:\projects\golang\stspider\runtime\chrome_user_data"`)
    newCmd, ok := startBrowserCmds[runtime.GOOS]
    if !ok {
        // Calling the missing map entry would be a nil function call and panic.
        return errors.New("starting chrome is not supported on " + runtime.GOOS)
    }
    cmd := newCmd(c)
    if err := cmd.Start(); err != nil {
        log.Println("Error Happend In StartBrowser:", err)
        return err
    }
    time.Sleep(1 * time.Second)

    // NOTE(review): Stdout/Stderr are assigned after Start, so os/exec never
    // uses them and both buffers stay empty in the log line below. Assigning
    // them before Start would attach pipes owned by this process to a browser
    // that is meant to keep running, so the behavior is left unchanged here.
    var outBytes, errBytes bytes.Buffer
    cmd.Stdout = &outBytes
    cmd.Stderr = &errBytes
    log.Printf("\n----Exected:--out:%s----err:%s----\n", outBytes.String(), errBytes.String())

    return nil
}

// File extensions of the launcher scripts written by getFileToRunChrome.
const (
    // FILE_EXT_BAT is the extension of a Windows batch launcher.
    FILE_EXT_BAT = ".bat"
    // FILE_EXT_SH is the extension of a shell launcher; it gets a "#!/bin/sh" first line.
    FILE_EXT_SH = ".sh"
)

// getFileToRunChrome writes a launcher script named "chrome"+ext into the
// application's runtime path and returns the script's path.
//
// The script is the quoted ChromePath followed by every entry of ChromeRunArgs,
// separated by spaces; for FILE_EXT_SH a "#!/bin/sh" line comes first. The
// script text is also printed to standard output.
//
// The path is returned even when the file could not be written; open, write
// and close failures are printed because the method has no error result.
func (c ChromeService) getFileToRunChrome(ext string) string {
    scriptPath := config.GetAppPath().GetByRuntimePath("chrome" + ext)

    var script bytes.Buffer
    if ext == FILE_EXT_SH {
        script.WriteString("#!/bin/sh\n")
    }
    script.WriteString(`"` + c.ChromePath + `"`)
    for _, arg := range c.ChromeRunArgs {
        script.WriteString(" " + arg)
    }
    fmt.Println(script.String())

    f, err := os.OpenFile(scriptPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777)
    if err != nil {
        // Nothing was opened, so there is nothing to write to or close.
        fmt.Printf("----------ERROR--%v \n", err)
        return scriptPath
    }
    if _, err := f.Write(script.Bytes()); err != nil {
        fmt.Printf("----------ERROR--%v \n", err)
    }
    if err := f.Close(); err != nil {
        fmt.Printf("----------ERROR--%v \n", err)
    }
    return scriptPath
}


// RunNavigateAndOutHtmlWithTimeout navigates the tab bound to c.RunCtx to
// requrl and stores the outer HTML of the page's <html> element in *html.
//
// Navigation gets at most d to finish. If chromedp.Navigate returns in time the
// HTML is read right away; otherwise the HTML of whatever has loaded so far is
// read when the timer fires. The HTML is read exactly once and the method
// blocks until that read has completed.
//
// chromedp errors are logged rather than returned because the method has no
// return value. When the timeout wins, the goroutine running Navigate keeps
// running until chromedp.Run returns on its own.
func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
    var once sync.Once
    done := make(chan struct{})

    // snapshot reads the page HTML at most once, whichever path (timer or
    // finished navigation) reaches it first, and then releases the caller.
    snapshot := func(msg string) {
        once.Do(func() {
            defer close(done)
            fmt.Print(msg)
            if err := chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html)); err != nil {
                log.Printf("RunNavigateAndOutHtmlWithTimeout: read outer html of %q: %v", requrl, err)
            }
        })
    }

    timer := time.AfterFunc(d, func() {
        snapshot(fmt.Sprintf("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n", d))
    })
    // Release the timer when navigation finished first; a no-op if it already fired.
    defer timer.Stop()

    go func() {
        // If navigation does not return before the timeout, the timer takes the snapshot instead.
        if err := chromedp.Run(c.RunCtx, chromedp.Navigate(requrl)); err != nil {
            log.Printf("RunNavigateAndOutHtmlWithTimeout: navigate to %q: %v", requrl, err)
        }
        snapshot(fmt.Sprintf("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n", d))
    }()

    <-done
}

使用:

    // Build the service from the Chrome configuration.
    chrome := service.NewChrome()
    // Launch Chrome unless a process already owns the configured debugger
    // port. StartBrowser returns an error, which this example does not check.
    chrome.StartBrowser()
    // Create the chromedp contexts used by the Run* methods.
    chrome.StartChromedp()
    // html receives the page's outer HTML; navigation is given 3 seconds
    // before the HTML loaded so far is read instead.
    html := ""
    chrome.RunNavigateAndOutHtmlWithTimeout("https://www.amazon.com/dp/B091254PRB?language=zh_CN&th=1&psc=1", &html, 3*time.Second)

How to make chromedp.Navigate(url) timeout when the page is not fully loaded for a long time? https://github.com/chromedp/chromedp/issues/757
未经允许不得转载:沙滩星空的博客 » 谷歌浏览器爬虫(chromedp)页面请求超时的解决方案

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址