缘起
服务器上部署了谷歌浏览器,以 headless 模式(即无头模式,无 GUI 界面)运行。抓取亚马逊的某个页面时,页面一直加载不完,请求就此卡死,整整卡了一天,后面的任务也跑不下去了。
解决
我的代码封装了一个 ChromeService 服务,专门用来做浏览器爬虫。
可以使用标准库的 time.AfterFunc 函数解决:到达超时时间后不再等待页面加载完成,直接读取当前页面的 HTML。
在服务中新增一个方法 RunNavigateAndOutHtmlWithTimeout:
// RunNavigateAndOutHtmlWithTimeout navigates to requrl and stores the outer
// HTML of the page's <html> element in *html, waiting at most d for the
// navigation to finish.
//
// If the navigation returns within d, the HTML is captured right away. If it
// does not, a timer fires after d and captures whatever has been loaded so
// far. Either way the capture happens exactly once and the method returns
// only after it has completed.
//
// The method has no error result, so failures of the underlying chromedp
// calls are printed rather than returned; in that case *html may be left
// unchanged.
//
// Note: on the timeout path the goroutine running the navigation is still
// blocked inside chromedp.Run when this method returns; it ends only when
// that call returns.
func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
	var once sync.Once
	done := make(chan struct{})

	// capture reads the HTML exactly once, for whichever path gets here first.
	// sync.Once also makes the losing path wait until the capture is finished.
	capture := func(timedOut bool) {
		once.Do(func() {
			defer close(done)
			if timedOut {
				fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n", d)
			} else {
				fmt.Printf("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n", d)
			}
			if err := chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html)); err != nil {
				fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---OuterHTML error: %v\n", err)
			}
		})
	}

	// Timeout path: stop waiting for the page and take what is there.
	timer := time.AfterFunc(d, func() { capture(true) })
	// Release the timer as soon as we return instead of letting it fire later.
	defer timer.Stop()

	// Normal path: capture as soon as the navigation returns.
	go func() {
		if err := chromedp.Run(c.RunCtx, chromedp.Navigate(requrl)); err != nil {
			fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---Navigate error: %v\n", err)
		}
		capture(false)
	}()

	<-done
}
完整代码:
package service
import (
"bytes"
"context"
"errors"
"fmt"
"log"
"math/rand"
"os"
"os/exec"
"runtime"
"stspider/config"
"stspider/osfile"
"stspider/util"
"sync"
"time"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
)
// ChromeService bundles the settings used to launch or attach to a Chrome
// browser together with the chromedp contexts created by StartChromedp.
type ChromeService struct {
// isStartedChromedp is set by StartChromedp so that repeated calls are no-ops.
isStartedChromedp bool
// ChromePath is the Chrome executable; StartBrowser falls back to a per-OS
// default when it is empty.
ChromePath string
// ChromeRunArgs are the command-line arguments StartBrowser passes to Chrome.
ChromeRunArgs []string
// DebuggerAddress, when non-empty, makes StartChromedp attach to that remote
// debugging endpoint instead of using an exec allocator.
DebuggerAddress string // e.g. "ws://127.0.0.1:9222"
// AllocatorCtx and AllocatorCancel are the allocator context and its cancel
// function, assigned by StartChromedp.
AllocatorCtx context.Context
AllocatorCancel context.CancelFunc
// RunCtx and RunCancel come from chromedp.NewContext in StartChromedp;
// RunCtx is the context passed to chromedp.Run.
RunCtx context.Context
RunCancel context.CancelFunc
}
// NewChrome returns a ChromeService whose Chrome path, debugger address and
// run arguments are taken from config.NewChromeConfigData(). It only fills in
// those fields; it starts neither the browser nor chromedp.
func NewChrome() *ChromeService {
	cfg := config.NewChromeConfigData()

	svc := new(ChromeService)
	svc.ChromePath, svc.DebuggerAddress = cfg.ChromePath, cfg.DebuggerAddress
	svc.ChromeRunArgs = cfg.GetRunArgs()
	return svc
}
// StartChromedp creates the chromedp allocator and run contexts and stores
// them, with their cancel functions, on the service. Calling it again after a
// successful call does nothing.
//
// With an empty DebuggerAddress an exec allocator is used, built from
// chromedp's default options plus headless=false; otherwise a remote
// allocator pointing at DebuggerAddress is used.
//
// NOTE(review): the original comment said to close via a Cancel() func; no
// such method is visible here, only the stored RunCancel and AllocatorCancel.
func (c *ChromeService) StartChromedp() {
	if c.isStartedChromedp {
		return
	}

	base := context.Background()
	switch c.DebuggerAddress {
	case "":
		execOpts := append(chromedp.DefaultExecAllocatorOptions[:], chromedp.Flag("headless", false))
		c.AllocatorCtx, c.AllocatorCancel = chromedp.NewExecAllocator(base, execOpts...)
	default:
		c.AllocatorCtx, c.AllocatorCancel = chromedp.NewRemoteAllocator(base, c.DebuggerAddress)
	}

	c.RunCtx, c.RunCancel = chromedp.NewContext(c.AllocatorCtx)
	c.isStartedChromedp = true
}
// defaultChromePaths maps a runtime.GOOS value to the Chrome executable that
// StartBrowser uses when ChromeService.ChromePath is empty.
var defaultChromePaths = map[string]string{
	"windows": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
	"linux":   "/usr/bin/google-chrome-stable",
	// On macOS Chrome is installed as an application bundle, not under /usr/bin.
	"darwin": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
}
// startBrowserCmds maps a runtime.GOOS value to a constructor for the command
// that launches Chrome on that platform. Windows goes through a generated
// .bat script; Linux and macOS run ChromePath with ChromeRunArgs directly.
var startBrowserCmds = map[string]func(c *ChromeService) *exec.Cmd{
	"windows": func(svc *ChromeService) *exec.Cmd {
		script := svc.getFileToRunChrome(FILE_EXT_BAT)
		return exec.Command(script)
	},
	// Alternative for Linux: run the script from svc.getFileToRunChrome(FILE_EXT_SH).
	"linux": func(svc *ChromeService) *exec.Cmd {
		return exec.Command(svc.ChromePath, svc.ChromeRunArgs...)
	},
	"darwin": func(svc *ChromeService) *exec.Cmd {
		return exec.Command(svc.ChromePath, svc.ChromeRunArgs...)
	},
}
// StartBrowser starts Chrome in remote-debugging mode unless a process is
// already using the configured debugger port.
// Like: "C:/Program Files/Google/Chrome/Application/chrome.exe" --remote-debugging-port=9222
//
// When ChromePath is empty the per-OS default from defaultChromePaths is
// used. It returns an error if the Chrome executable does not exist, if the
// current OS has no launch command registered in startBrowserCmds, or if the
// process cannot be started. After a successful start it waits one second
// and logs whatever Chrome printed during that time.
//
// TODO: the user-data-dir directory is not writable.
func (c *ChromeService) StartBrowser() error {
	if c.ChromePath == "" {
		c.ChromePath = defaultChromePaths[runtime.GOOS]
	}
	fmt.Println(c.ChromePath)
	fmt.Println(c.ChromeRunArgs)
	if !osfile.IsPathExists(c.ChromePath) {
		errMsg := "can not find chrome in path: " + c.ChromePath
		util.GetLogger().Debug(errMsg)
		return errors.New(errMsg)
	}

	port := config.NewChromeConfigData().GetDebuggerPort()
	log.Println("---GetDebuggerPort---", port)
	chromePid := util.GetPidByPort(port)
	log.Println("-----chromePid---", chromePid)

	// A positive PID is treated as "the port is already in use", so the
	// browser is assumed to be running and is not started again.
	if chromePid > 0 {
		log.Println("--Skip----Browser--Started---")
		return nil
	}

	log.Println("-----StartBrowser---")
	newCmd, ok := startBrowserCmds[runtime.GOOS]
	if !ok {
		// Calling the missing map entry would be a nil-function panic.
		return errors.New("start chrome: unsupported operating system " + runtime.GOOS)
	}
	cmd := newCmd(c)

	// Stdout/Stderr must be set before Start; assigning them afterwards has
	// no effect on the running process.
	var outBytes, errBytes startupOutput
	cmd.Stdout = &outBytes
	cmd.Stderr = &errBytes
	if err := cmd.Start(); err != nil {
		log.Println("Error Happend In StartBrowser:", err)
		return err
	}

	// Reap the child when it eventually exits so its resources are released.
	// This goroutine lives exactly as long as the browser process.
	go func() {
		if err := cmd.Wait(); err != nil {
			log.Println("chrome process exited:", err)
		}
	}()

	time.Sleep(1 * time.Second)
	log.Printf("\n----Exected:--out:%s----err:%s----\n", outBytes.snapshot(), errBytes.snapshot())
	return nil
}

// startupOutput is an io.Writer that collects the early output of a child
// process. It is guarded by a mutex because os/exec writes to it from its own
// goroutine while StartBrowser reads it.
type startupOutput struct {
	mu      sync.Mutex
	buf     bytes.Buffer
	stopped bool
}

// Write stores p until snapshot has been called; after that it discards the
// data so a long-running browser cannot grow the buffer without bound.
func (s *startupOutput) Write(p []byte) (int, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if !s.stopped {
		s.buf.Write(p)
	}
	return len(p), nil
}

// snapshot returns everything captured so far and stops further capturing.
func (s *startupOutput) snapshot() string {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.stopped = true
	out := s.buf.String()
	s.buf = bytes.Buffer{}
	return out
}
// File extensions accepted by getFileToRunChrome for the generated launcher
// script.
const (
	// FILE_EXT_BAT is the extension of a Windows batch script.
	FILE_EXT_BAT = ".bat"
	// FILE_EXT_SH is the extension of a shell script; scripts with this
	// extension get a "#!/bin/sh" first line.
	FILE_EXT_SH = ".sh"
)
// getFileToRunChrome writes a launcher script named "chrome"+ext into the
// application's runtime path and returns the script's path.
//
// The script is the quoted ChromePath followed by each of ChromeRunArgs,
// separated by spaces; for FILE_EXT_SH it is preceded by a "#!/bin/sh" line.
// The script text is also printed to stdout.
//
// The method has no error result, so a failure to create, write or close the
// file is printed and the path is returned anyway; the caller will then fail
// when it tries to execute the script.
func (c ChromeService) getFileToRunChrome(ext string) string {
	scriptPath := config.GetAppPath().GetByRuntimePath("chrome" + ext)

	f, err := os.OpenFile(scriptPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777)
	if err != nil {
		fmt.Printf("----------ERROR--%v \n", err)
	}

	var script bytes.Buffer
	if ext == FILE_EXT_SH {
		script.WriteString("#!/bin/sh\n")
	}
	script.WriteString(`"` + c.ChromePath + `"`)
	for _, arg := range c.ChromeRunArgs {
		script.WriteString(" " + arg)
	}
	fmt.Println(script.String())

	// Nothing to write to if the file could not be opened.
	if f == nil {
		return scriptPath
	}
	if _, err := f.Write(script.Bytes()); err != nil {
		fmt.Printf("----------ERROR--%v \n", err)
	}
	// A failed Close can mean the content never reached the disk.
	if err := f.Close(); err != nil {
		fmt.Printf("----------ERROR--%v \n", err)
	}
	return scriptPath
}
// RunNavigateAndOutHtmlWithTimeout navigates to requrl and stores the outer
// HTML of the page's <html> element in *html, waiting at most d for the
// navigation to finish.
//
// If the navigation returns within d, the HTML is captured right away. If it
// does not, a timer fires after d and captures whatever has been loaded so
// far. Either way the capture happens exactly once and the method returns
// only after it has completed.
//
// The method has no error result, so failures of the underlying chromedp
// calls are printed rather than returned; in that case *html may be left
// unchanged.
//
// Note: on the timeout path the goroutine running the navigation is still
// blocked inside chromedp.Run when this method returns; it ends only when
// that call returns.
func (c *ChromeService) RunNavigateAndOutHtmlWithTimeout(requrl string, html *string, d time.Duration) {
	var once sync.Once
	done := make(chan struct{})

	// capture reads the HTML exactly once, for whichever path gets here first.
	// sync.Once also makes the losing path wait until the capture is finished.
	capture := func(timedOut bool) {
		once.Do(func() {
			defer close(done)
			if timedOut {
				fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---等待超时(%v),执行--AfterFunc\n", d)
			} else {
				fmt.Printf("-----NavigateAndOutHtmlWithTimeout---指定时间(%v)内响应\n", d)
			}
			if err := chromedp.Run(c.RunCtx, chromedp.OuterHTML("html", html)); err != nil {
				fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---OuterHTML error: %v\n", err)
			}
		})
	}

	// Timeout path: stop waiting for the page and take what is there.
	timer := time.AfterFunc(d, func() { capture(true) })
	// Release the timer as soon as we return instead of letting it fire later.
	defer timer.Stop()

	// Normal path: capture as soon as the navigation returns.
	go func() {
		if err := chromedp.Run(c.RunCtx, chromedp.Navigate(requrl)); err != nil {
			fmt.Printf("-----RunNavigateAndOutHtmlWithTimeout---Navigate error: %v\n", err)
		}
		capture(false)
	}()

	<-done
}
使用:
chrome := service.NewChrome()
// StartBrowser returns an error when Chrome cannot be found or launched;
// stop here instead of carrying on without knowing a browser is available.
if err := chrome.StartBrowser(); err != nil {
	log.Fatalf("start browser: %v", err)
}
chrome.StartChromedp()
html := ""
// Wait at most 3 seconds for the page, then take whatever HTML is loaded.
chrome.RunNavigateAndOutHtmlWithTimeout("https://www.amazon.com/dp/B091254PRB?language=zh_CN&th=1&psc=1", &html, 3*time.Second)
参考:How to make chromedp.Navigate(url) timeout when the page is not fully loaded for a long time? https://github.com/chromedp/chromedp/issues/757