首页 > 代码库 > 批量去重URL地址并剔除打不开网址

批量去重URL地址并剔除打不开网址

#coding=utf-8
"""Deduplicate a list of hosts by resolved IP address, dropping unreachable ones.

Hosts are read one per line from an input file.  Each host is probed with an
HTTP GET; hosts that do not answer with status 200 are discarded.  The
remaining hosts are resolved, and only the first host seen for each IP
address is kept.  The surviving hosts are written one per line to an output
file.

The code runs on both Python 2.6+ and Python 3.
"""
import os
import socket

try:
    import httplib  # Python 2
except ImportError:
    import http.client as httplib  # Python 3

# Maps resolved IP address -> first reachable host that resolved to it.
dictlist = {}


def ReadHost(path='d:/sss.txt'):
    """Return the non-empty host names listed one per line in *path*.

    Surrounding whitespace (including a trailing ``\\r`` left by Windows
    line endings) is removed and blank lines are skipped.
    """
    hosts = []
    with open(path) as obn:
        for line in obn:
            line = line.strip()
            if line:
                hosts.append(line)
    return hosts


def GetWebStatus(host, path='/', timeout=10):
    """Return 1 if ``GET path`` on *host* answers with HTTP 200, else 0.

    *timeout* is the socket timeout in seconds, so that a single
    unresponsive host cannot stall the whole batch.

    HTTP protocol errors are reported as 0.  Socket-level errors (connection
    refused, reset, timed out, name resolution failure, ...) are not caught
    here; they propagate to the caller.
    """
    conn = None
    try:
        conn = httplib.HTTPConnection(host, timeout=timeout)
        conn.request('GET', path)
        if conn.getresponse().status == 200:
            return 1
        return 0
    except httplib.HTTPException:
        return 0
    finally:
        # Close on every path; the constructor itself may raise, in which
        # case there is nothing to close.
        if conn is not None:
            conn.close()


def SysDNS():
    """Fill ``dictlist`` with one reachable host per resolved IP address.

    Hosts that fail the HTTP check or cannot be resolved are skipped.  Any
    other error raised while probing a host is printed and the host is
    skipped.
    """
    for host in ReadHost():
        try:
            if GetWebStatus(host) == 0:
                continue
            myaddrs = socket.getaddrinfo(host, None)
        except (socket.herror, socket.gaierror):
            continue
        except Exception as e2:
            print(e2)
            continue
        if not myaddrs:
            continue
        # Only the first resolved address is used as the de-duplication key.
        addrs = myaddrs[0][4][0]
        if addrs not in dictlist:
            dictlist[addrs] = host


def showDict(path="d:/out.txt"):
    """Write every host kept in ``dictlist`` to *path*, one per line."""
    with open(path, 'w') as fw:
        fw.writelines(host + '\n' for host in dictlist.values())


if __name__ == "__main__":
    SysDNS()
    showDict()

url可能会出现的错误:

[Errno 10060]
[Errno 10061]
[Errno 10054]
[Errno 10053]