1、爬百度贴吧并发版
示例:
// This program concurrently downloads a user-chosen range of Baidu Tieba list
// pages and stores each one as "<page number>.html" in the working directory.
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"sync"
	"time"
)

const (
	// tiebaListURL is the list-page URL with the value of its trailing "pn"
	// query parameter left off; pageURL appends that value.
	tiebaListURL = "http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn="

	// pageStep is how much "pn" grows from one page to the next
	// (page 1 -> pn=0, page 2 -> pn=50, ...).
	pageStep = 50
)

// httpClient is shared by every download. The timeout keeps a stalled server
// from blocking a crawl goroutine, and therefore DoWork, forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// HttpGet downloads url and returns the complete response body as a string.
//
// It returns a non-nil error, and an empty result, when the request fails,
// when the server answers with a non-2xx status, or when reading the body
// fails part-way through.
func HttpGet(url string) (result string, err error) {
	resp, err := httpClient.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// An error page is not the content the caller asked for.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.ReadAll reports read errors instead of silently truncating, and
	// avoids re-copying the accumulated string on every chunk.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return string(body), nil
}

// pageURL returns the URL of list page i (1-based).
func pageURL(i int) string {
	return tiebaListURL + strconv.Itoa((i-1)*pageStep)
}

// SpiderPage downloads list page i and writes it to "<i>.html" in the working
// directory.
//
// On success it sends i on page. On failure it prints the error and returns
// without sending, so a receiver must not expect one value per call.
func SpiderPage(i int, page chan int) {
	url := pageURL(i)
	fmt.Printf("正在爬第%d页网页: %s\n", i, url)

	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err = ", err)
		return
	}

	// os.WriteFile creates or truncates the file and surfaces write and
	// close errors as well as create errors.
	fileName := strconv.Itoa(i) + ".html"
	if err := os.WriteFile(fileName, []byte(result), 0666); err != nil {
		fmt.Println("os.WriteFile err = ", err)
		return
	}

	page <- i
}

// DoWork crawls pages start through end (inclusive), one goroutine per page,
// and prints a line for every page that was saved successfully. It returns
// once every goroutine has finished, whether or not its page succeeded.
func DoWork(start, end int) {
	fmt.Printf("正在爬取 %d 到 %d 的页面\n", start, end)

	page := make(chan int)
	var wg sync.WaitGroup
	for i := start; i <= end; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			SpiderPage(i, page)
		}(i)
	}

	// Failed pages never send, so counting receives would block forever.
	// Close the channel once all workers are done and range over it instead.
	go func() {
		wg.Wait()
		close(page)
	}()

	for i := range page {
		fmt.Printf("第%d个页面爬取完成\n", i)
	}
}

// main reads the page range from standard input, validates it against the
// limits stated in the prompts, and starts the crawl.
func main() {
	var start, end int

	fmt.Printf("请输入起始页( >= 1) :")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("fmt.Scan err = ", err)
		return
	}

	fmt.Printf("请输入终止页( >= 起始页) :")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("fmt.Scan err = ", err)
		return
	}

	if start < 1 || end < start {
		fmt.Println("页码范围无效: 起始页必须 >= 1, 终止页必须 >= 起始页")
		return
	}

	DoWork(start, end)
}
执行结果:
D:\GoFiles\src\hello_01>go run get_web.go
请输入起始页( >= 1) :1
请输入终止页( >= 起始页) :10
正在爬取 1 到 10 的页面
正在爬第3页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=100
正在爬第10页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=450
正在爬第6页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=250
正在爬第7页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=300
正在爬第8页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=350
正在爬第9页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=400
正在爬第2页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=50
正在爬第4页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=150
正在爬第5页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=200
正在爬第1页网页: http://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=0
%!第(int=4)%!d(MISSING)个页面爬取完成
%!第(int=9)%!d(MISSING)个页面爬取完成
%!第(int=3)%!d(MISSING)个页面爬取完成
%!第(int=8)%!d(MISSING)个页面爬取完成
%!第(int=10)%!d(MISSING)个页面爬取完成
%!第(int=1)%!d(MISSING)个页面爬取完成
%!第(int=5)%!d(MISSING)个页面爬取完成
%!第(int=2)%!d(MISSING)个页面爬取完成
%!第(int=7)%!d(MISSING)个页面爬取完成
%!第(int=6)%!d(MISSING)个页面爬取完成

注:由于各页面并发爬取,输出顺序每次运行都可能不同。上面的 `%!第(int=4)%!d(MISSING)` 是由格式串 "%第%d个页面爬取完成\n" 产生的:开头多了一个 `%`,fmt 把 "%第" 当成了无效的格式动词并消耗了唯一的参数,后面的 "%d" 因此没有对应的参数。把格式串写成 "第%d个页面爬取完成\n" 后,输出即为 "第4个页面爬取完成"。