
Using Celery

1. Task scheduling with Celery

# -*- coding: utf-8 -*-
import threading

from bs4 import BeautifulSoup
from tornado import httpclient
from celery import Celery
from tornado.httpclient import HTTPClient

# Use a local Redis instance as both the message broker and the result backend.
broker = 'redis://localhost:6379'
backend = 'redis://localhost:6379'

app = Celery('tasks', broker=broker, backend=backend)

# Already-processed URLs. Note: this dict lives separately in each worker
# process, so deduplication here is per-process only, not global.
visited = {}


@app.task
def get_html(url):
    # Fetch the page synchronously; return None on HTTP errors.
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except httpclient.HTTPError:
        return None
    finally:
        http_client.close()


def start(url):
    # Spawn 20 daemon threads, each submitting crawl tasks for the seed URL.
    threads = []
    for i in range(20):
        t = threading.Thread(target=schedule, args=(url,))
        t.daemon = True
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()


def process_html(url, html):
    # response.body is bytes, so decode before printing.
    print(url + ": " + html.decode('utf-8', errors='replace'))
    _add_links_to_queue(url, html)


def schedule(url):
    print("before call _worker " + url)
    _worker.delay(url)
    print("after call _worker " + url)


def _add_links_to_queue(url, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        try:
            _url = link['href']
        except KeyError:
            # Anchor tag without an href attribute; skip it.
            continue

        if not _url.startswith('http'):
            _url = 'http://' + _url
        print(url + "==>" + _url)
        schedule(_url)


@app.task
def _worker(url):
    print(str(threading.current_thread()) + " running " + url)
    # Skip URLs this worker process has already handled.
    if url in visited:
        return
    # Note: calling result.get() inside a task is discouraged by Celery
    # because it can deadlock the worker pool; kept here to mirror the
    # original flow.
    result = get_html.delay(url)
    html = None
    try:
        html = result.get(timeout=5)
    except Exception as e:
        print(url)
        print(e)
    finally:
        if html is not None:
            process_html(url, html)
        visited[url] = True


if __name__ == '__main__':
    start("http://www.hao123.com/")

  

2. How to design load balancing with Celery

Celery provides send_task for dispatching tasks by name, so for load balancing you can apply your own algorithm to decide which queue (and therefore which worker) each task lands on; see the sketch below. Reference: http://blog.csdn.net/vintage_1/article/details/47664187
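
A minimal sketch of that idea, assuming Redis on localhost and hypothetical queue names crawl_0/crawl_1/crawl_2 (start one worker per queue, e.g. celery -A tasks worker -Q crawl_0 --loglevel=info); the round-robin policy here is just a placeholder for your own assignment algorithm:

# -*- coding: utf-8 -*-
import itertools

from celery import Celery

app = Celery('tasks', broker='redis://localhost:6379',
             backend='redis://localhost:6379')

# Hypothetical queue names; run one worker per queue so tasks can be
# spread across workers explicitly.
queues = ['crawl_0', 'crawl_1', 'crawl_2']
_next_queue = itertools.cycle(queues)  # placeholder round-robin policy


def dispatch(url):
    # send_task dispatches by task name, so this module does not need to
    # import the task function itself; replace the round-robin choice
    # with any load-balancing algorithm you prefer.
    queue = next(_next_queue)
    return app.send_task('tasks.get_html', args=[url], queue=queue)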
