首页 > 代码库 > python单线程爬取阿里云maven库

python单线程爬取阿里云maven库

import os
import re

import requests

COMPILE = re.compile()
URL = ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
CASE_INSENSE = []

class MavenException(Exception):
    """Error describing a failed HTTP request to the Maven repository.

    The instance carries the details of the response that triggered it so a
    caller can log or inspect them.

    Attributes:
        url: The URL that was requested.
        code: The HTTP status code of the response.
        reason: The textual reason phrase of the response.
        content: The raw response body.
    """

    # NOTE(review): the extracted listing lost the ``class``/``def``/``self``
    # tokens and the base-class name; ``Exception`` is assumed as the base --
    # confirm against the original script.
    def __init__(self, url, code, reason, content):
        super(MavenException, self).__init__()
        self.url = url
        self.code = code
        self.reason = reason
        self.content = content


def get_urls(url=URL):
    """Fetch *url* and return every ``COMPILE`` match found in the response body.

    Args:
        url: Page to download; defaults to the module-level ``URL``.

    Returns:
        The result of ``COMPILE.findall`` applied to the response body.

    Raises:
        MavenException: If the response status is not ``requests.codes.ok``.
    """
    result = requests.get(url)
    code = result.status_code
    if code != requests.codes.ok:
        raise MavenException(url, code, result.reason, result.content)
    # NOTE(review): ``result.content`` is the raw body (bytes on Python 3), so
    # ``COMPILE`` must be a matching pattern type -- its literal was lost in
    # the extracted listing; confirm.
    return COMPILE.findall(result.content)


():
    (url):
    result = {}
    disk_path = url.rsplit()[]
    is_dir = disk_path.endswith()
    result[] = disk_path.split()[].replace(os.sep)
    result[] = is_dir
    result


def make_dir(disk_path):
    """Create the directory ``ROOT_DIR + disk_path`` on the local disk.

    Args:
        disk_path: Path fragment appended to ``ROOT_DIR`` by plain string
            concatenation, so it must already carry its leading separator.

    Raises:
        OSError: Propagated from ``os.mkdir`` (for example when the directory
            already exists or its parent is missing).
    """
    os.mkdir(ROOT_DIR + disk_path)


def down(url, disk_path):
    """Download *url* and write the response body to ``ROOT_DIR + disk_path``.

    Args:
        url: Resource to download.
        disk_path: Path fragment appended to ``ROOT_DIR`` by plain string
            concatenation.
    """
    result = requests.get(url)
    # NOTE(review): the open-mode literal was lost in the extracted listing.
    # ``result.content`` is the raw response body, so binary write mode is
    # assumed here -- confirm against the original script.
    with open(ROOT_DIR + disk_path, "wb") as f:
        f.write(result.content)


(url):
    resource = handle_resource_type(url)
    urlresource
    resource[]:
        down(urlresource[])
    :

        :
            make_dir(resource[])
            e:
            e.winerror == :
                CASE_INSENSE.append(resource[])
                make_dir(resource[].rstrip()++((CASE_INSENSE))+)
            :
                e
        urls = get_urls(url)
        urls
        [u u urls handle_resource_type(u)[]]:
            item urls:
                parse_url(item)
            :
            item urls:
                parse_url(item)


__name__ == :
    url get_urls():
        parse_url(url)


下一步的目标是改用线程池,单线程实在太慢了……

从 17:00 到第二天 1:00 只爬了约 1/3,中途程序还崩溃了……

还需要加入日志和容错处理。

http://xiaorui.cc/2014/11/15/%E4%BD%BF%E7%94%A8python%E7%9A%84%E4%B8%8A%E5%B1%82%E5%B0%81%E8%A3%85%E5%B9%B6%E5%8F%91%E5%BA%93concurrent-futures%E5%AE%9E%E7%8E%B0%E5%BC%82%E6%AD%A5/

python单线程爬取阿里云maven库