首页 > 代码库 > [Golang] 一个简易代理池

[Golang] 一个简易代理池

  晚上写了一个简易的代理池:从一个代理网站上爬取代理 IP 和端口,并测试这些代理是否可用。接下来可能会考虑把它扩展成一个规模更大的、用 Golang 实现的代理池。

  简易版代码(下面依次是三个同属 package main 的 Go 源文件:文件读写工具、HTTP 请求工具和抓取主程序):

 1 package main
 2 
 3 import (
 4     "os"
 5     "fmt"
 6     log "github.com/Sirupsen/logrus"
 7     "io/ioutil"
 8     "strings"
 9 )
10 
// New is a plain record made of six string fields, declared in the order
// Prefix, NewId, Title, Time, Content, Subject. Nothing in the code shown here
// reads or writes it; presumably it describes one scraped news item
// (assumption — confirm against its users).
type New struct {
	Prefix, NewId, Title   string
	Time, Content, Subject string
}
19 
// Subject pairs a Name with a Url, both stored as plain strings. It is not
// referenced by any other code shown here.
type Subject struct {
	Name, Url string
}
24 
// CreateDir creates the single directory PathName with permission bits 0777.
//
// An "already exists" failure is treated as success, so calling it repeatedly
// for the same path is harmless. Any other os.Mkdir error (for example a
// missing parent directory) is returned unchanged.
func CreateDir(PathName string) error {
	mkErr := os.Mkdir(PathName, 0777)
	switch {
	case mkErr == nil, os.IsExist(mkErr):
		return nil
	default:
		return mkErr
	}
}
32 
33 func AppendFile(SavePath string, FileName string, buf string) {
34     out, err := os.OpenFile(SavePath+FileName, os.O_WRONLY, 0644)
35     defer out.Close()
36     if err != nil {
37         log.Errorln(err.Error())
38         return
39     }
40     offset, err := out.Seek(0, os.SEEK_END)
41     if err != nil {
42         log.Errorln(err.Error())
43         return
44     }
45     _, err = out.WriteAt([]byte(buf), offset)
46     if err != nil {
47         log.Errorln(err.Error())
48         return
49     }
50     log.Warnln("Save file finished. Locate in ", SavePath + FileName)
51 }
52 
// PathExists reports whether os.Stat succeeds for path.
//
// Every Stat failure yields false — not only "does not exist" but also, for
// example, a permission error — so false means "could not be confirmed".
func PathExists(path string) bool {
	_, statErr := os.Stat(path)
	return statErr == nil
}
63 
64 func SaveFile(SavePath string, FileName string, buf string) {
65     out, err := os.Create(SavePath + FileName)
66     defer out.Close()
67     fmt.Fprintf(out, "%s", buf)
68     if err != nil {
69         log.Errorln(err.Error())
70         return
71     }
72     log.Warnln("Save file finished. Locate in ", SavePath + FileName)
73 }
74 
// ReadAll opens the file at path and returns its entire contents.
//
// When the file cannot be opened the result is (nil, err). Otherwise the file
// is closed on return and whatever ioutil.ReadAll produced is passed through.
func ReadAll(path string) (data []byte, err error) {
	var src *os.File
	if src, err = os.Open(path); err != nil {
		return nil, err
	}
	defer src.Close()

	data, err = ioutil.ReadAll(src)
	return data, err
}
83 
84 func ReadFile(path string) []string {
85     var fp interface{}
86     fp, err := ReadAll(path)
87     if err != nil {
88         log.Errorln(err.Error())
89         return nil
90     }
91     fp = string(fp.([]byte))
92     return strings.Split(fp.(string), "\n")
93 }
 1 package main
 2 
 3 import (
 4     log "github.com/Sirupsen/logrus"
 5     "math/rand"
 6     "net/http"
 7     "net/url"
 8     "time"
 9 )
10 
// userAgent is the fixed pool of User-Agent header values that
// GetRandomUserAgent picks from.
//
// The entries use commas where browsers normally emit semicolons; they are
// kept as found, apart from the MSIE 9.0 entry whose closing parenthesis had
// been replaced by a stray comma.
var userAgent = [...]string{
	"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
	"Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
	"Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0)",
	"Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
	"Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	"Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
	"Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
}
27 
28 func GetRandomUserAgent() string {
29     var r = rand.New(rand.NewSource(time.Now().UnixNano()))
30     return userAgent[r.Intn(len(userAgent))]
31 }
32 
33 func GetFakeHeader(request *http.Request) {
34     request.Header.Set("User-Agent", GetRandomUserAgent())
35     request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
36     request.Header.Set("Connection", "keep-alive")
37     //request.Header.Set("Accept-Encoding", "gzip, deflate")
38 }
39 
40 func GetByProxy(proxyAddr string, Url string) (*http.Response, error) {
41     timeout := time.Duration(10 * time.Second)
42     request, err := http.NewRequest(http.MethodGet, Url, nil)
43     GetFakeHeader(request)
44     if err != nil {
45         return nil, err
46     }
47     proxy, err := url.Parse(proxyAddr)
48     if err != nil {
49         log.Errorln(err.Error())
50     }
51     client := &http.Client{
52         Transport: &http.Transport{
53             Proxy: http.ProxyURL(proxy),
54         },
55         Timeout: timeout,
56     }
57     return client.Do(request)
58 }
59 
60 func GetByDirectory(Url string) (*http.Response, error) {
61     timeout := time.Duration(10 * time.Second)
62     request, err := http.NewRequest(http.MethodGet, Url, nil)
63     GetFakeHeader(request)
64     if err != nil {
65         return nil, err
66     }
67     client := http.Client{
68         Timeout: timeout,
69     }
70     return client.Do(request)
71 }
 1 package main
 2 
 3 import (
 4     log "github.com/Sirupsen/logrus"
 5     "strconv"
 6     "regexp"
 7     "github.com/opesun/goquery"
 8     "time"
 9 )
10 
// SAVE_PATH is the name of the file the collected "ip:port" lines go into.
const SAVE_PATH = "kproxy.orz"

// PROXY_URL is the listing URL that UrlGetter appends a page number to.
const PROXY_URL = "http://www.kuaidaili.com/free/inha/"

// IP_REGEXP matches a dotted quad followed by a newline, further whitespace
// and a run of digits — one address/port pair in the page text.
var IP_REGEXP = regexp.MustCompile(`[\d]+\.[\d]+\.[\d]+\.[\d]+\n\s+[\d]+`)

// IP_DETAIL_REGEXP matches just the dotted quad.
var IP_DETAIL_REGEXP = regexp.MustCompile(`[\d]+\.[\d]+\.[\d]+\.[\d]+`)

// INT_REGEXP matches one whitespace character followed by digits; inside an
// IP_REGEXP match that is the port with one leading whitespace character.
var INT_REGEXP = regexp.MustCompile(`\s[\d]+`)
20 
21 func UrlGetter(num int) string {
22     return PROXY_URL + strconv.Itoa(num)
23 }
24 
25 func GetProxy(Url string) {
26     nod, err := goquery.ParseUrl(Url)
27     if err != nil {
28         log.Errorln(err.Error())
29         return
30     }
31     ret := nod.Text()
32     ips := IP_REGEXP.FindAll([]byte(ret), -1)
33     var port []string = make([]string, len(ips))
34     var str string = ""
35     for i := 0; i < len(ips); i++ {
36         port[i] = string(INT_REGEXP.FindAll(ips[i], -1)[0])[1:]
37         ips[i] = IP_DETAIL_REGEXP.FindAll(ips[i], -1)[0]
38         str += string(ips[i])+":"+port[i]+"\n"
39     }
40     AppendFile("./", SAVE_PATH, str)
41 }
42 
43 func main() {
44     log.Infoln("Start getting proxy ...")
45     SaveFile("./", SAVE_PATH, "")
46     for i := 1; i <= 500; i++ {
47         log.Println(UrlGetter(i))
48         GetProxy(UrlGetter(i))
49         time.Sleep(time.Second*5)
50     }
51 }

 

 

下面用一段 Python(Python 2)脚本来测试抓取到的代理是否可用:

 1 import urllib
 2 import urllib2
 3 import os
 4 import socket
 5 
 6 AIM_URL = ‘‘
 7 PROXY_PATH = ./kproxy.orz
 8 
 9 class MyException(Exception):
10     pass
11 
12 
def read_file(path):
    """Return the contents of the file at ``path`` split on newlines.

    A missing path prints a notice and yields an empty list. A file that ends
    with a newline yields a trailing empty string, and an empty file yields
    [''].
    """
    if not os.path.exists(path):
        print('path : \'' + path + '\' not find.')
        return []
    # The with-block closes the file on every exit path, so no extra
    # try/finally is needed around it.
    with open(path, 'r') as fp:
        content = fp.read()
    return content.split('\n')
24 
# Give every socket operation at most five seconds.
socket.setdefaulttimeout(5)
proxies = read_file(PROXY_PATH)
print(len(proxies))
for pp in proxies:
    # read_file leaves an empty entry after the final newline; skip blanks.
    if not pp:
        continue
    proxy_url = 'http://' + pp
    print(proxy_url)
    try:
        response = urllib.urlopen(AIM_URL, proxies={'http': proxy_url})
        response.close()
    except urllib2.URLError as e:
        # Timeouts are expected for dead proxies and are ignored.
        if isinstance(e.reason, socket.timeout):
            pass
    except Exception:
        # Best effort: any other failure just means this proxy is unusable.
        # Unlike a bare except, this still lets Ctrl-C stop the run.
        pass

 

[Golang] 一个简易代理池