首页 > 代码库 > php原生态生成静态缓存页,定时更新

php原生态生成静态缓存页,定时更新

	/**
	 * Imports the articles listed in DATA_DIR . 'infomation.json'.
	 *
	 * For each feed entry the linked page is fetched, the source label,
	 * second-level category and body are extracted according to the shape of
	 * the link, relative src/href attributes in the body are made absolute,
	 * the record is inserted (or updated when its checksum changed) through
	 * ArticleModel, and a static page is rendered to
	 * PROJECT_ROOT . '/html/article/a<id>.html'.
	 *
	 * Prints the number of rows written (inserted or updated) when done.
	 *
	 * @return void
	 */
	public function insertAction() {

		// The import sleeps between articles, so lift the execution time limit.
		ini_set('max_execution_time', '0');

		// Refresh the category list before inserting anything.
		$getHomeList = $this->getXmlAction();

		$arr_code = array(
			1 => '插入成功',
			-1 => '插入失败!请检查再试!',
			-2 => '获取xml文件失败!请检查再试!',
		);

		showApiCode($arr_code);

		// Map category name => category id.
		$getHomeList = is_array($getHomeList) ? array_column($getHomeList, 'id', 'name') : array();

		// Colour assigned to an article, picked by category id modulo 3.
		$color = array(
			0 => '#a56d57',
			1 => '#4c889c',
			2 => '#658965',
		);

		$ArticleModel = new ArticleModel();
		$Utils_CaptureWebContent = new Utils_CaptureWebContent('');

		// Load the feed; a missing or malformed file is treated as an empty feed.
		$xml = json_decode(file_get_contents(DATA_DIR . 'infomation.json'), true);
		if (!is_array($xml)) {
			$xml = array();
		}

		// Rows written during this run. Must live outside the loop, otherwise
		// it is reset for every article and the final report is meaningless.
		$i = 0;

		// Scraped pages are rarely valid HTML: collect libxml errors instead of
		// emitting a warning per malformed tag.
		$previousLibxmlSetting = libxml_use_internal_errors(true);

		foreach ($xml as $value) {
			$title = $value['title'];
			$summary = $value['description'];
			// Feed category; may be replaced by the second-level category below.
			$category_name = $value['category'];

			$send_time = strtotime($value['pubDate']);
			$utime = $ctime = time();

			// First-level category id, null when the feed category is unknown.
			$article_category = isset($getHomeList[$category_name]) ? $getHomeList[$category_name] : null;
			$category_color = $color[(int) $article_category % 3];

			$content_url = $value['link'];
			// Record id: unsigned CRC32 of the article link.
			$id = sprintf("%u", crc32($content_url));

			// Reset per article so nothing leaks in from the previous iteration
			// when a branch below does not assign these.
			$from_site = '';
			$content = '';

			$out = $this->getDataAction($content_url);
			// Force a UTF-8 <head> so DOMDocument decodes the page correctly.
			$out = preg_replace(array('/<head>([\s\S]+?)<\/head>/i'), array('<head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"></head>'), $out['output']);

			// Fresh document per article; stays empty when the fetch produced nothing.
			$dom = new DOMDocument();
			if (is_string($out) && $out !== '') {
				$dom->loadHTML($out);
				libxml_clear_errors();
			}
			$xpath = new DOMXPath($dom);

			// Length of the last path segment (including its leading '/'),
			// used to decide which kind of page the link points to.
			$str = strlen((string) strrchr($content_url, '/'));

			$html = $Utils_CaptureWebContent->captureGet($content_url);
			$html = $Utils_CaptureWebContent->formatHtml($html);

			// Three kinds of link, each yielding from_site, content and the
			// second-level category_name in its own way.
			if ($str < 2) {
				// Class 1 (non-standard link), e.g.
				// http://kjs.mep.gov.cn/hjbhbz/bzwb/dqhjbh/jcgfffbz/
				// Nothing is extracted, so $content stays empty and the
				// article is skipped further down.

			} else if ($str < 10) {
				// Class 2 (standard link), e.g. http://www.gdczepb.gov.cn/detail/24441
				$site = $xpath->query("//div[@class='cdaylist']/ul/li");
				$firstSite = $site->item(0);
				$from_site = $firstSite !== null ? $firstSite->nodeValue : '';
				if (strlen(trim($from_site)) < 10) {
					$from_site = '来源:资讯';
				}

				// The second-level category is the trailing text of the "cnav" div.
				$cate_html = $Utils_CaptureWebContent->matchHtmlElement("div", "class", "cnav", $html);
				// NOTE(review): confirm this is meant to strip a literal space.
				$cate_html = preg_replace('/ /', '', $cate_html);
				$category_name = substr($cate_html, strripos($cate_html, '>') + 1);
				if (!$category_name) {
					// Otherwise it is the text of the last <a> inside that div.
					$cate_name = $xpath->query("//div[@class='cnav']/a");
					$lastLink = $cate_name->length > 0 ? $cate_name->item($cate_name->length - 1) : null;
					$category_name = $lastLink !== null ? $lastLink->nodeValue : '';
				}

				$content = $Utils_CaptureWebContent->matchHtmlElement("div", "class", "contents", $html);

			} else {
				// Class 3 (semi-standard link), e.g.
				// http://kjs.mep.gov.cn/hjbhbz/bzwb/stzl/201109/t20110919_217415.htm
				// These pages carry no source label, so a fixed one is used.
				$from_site = '来源:科学技术司';

				$tables = $Utils_CaptureWebContent->matchAllHtmlElement("table", "class", "txtnormal", $html);
				$content = (isset($tables[0]) && is_array($tables[0])) ? implode('', $tables[0]) : '';

				$links = $Utils_CaptureWebContent->matchAllHtmlElement("a", "class", "dtdir12 CurrChnlCls", $html);
				$category_name = isset($links[1][3]) ? $links[1][3] : '';
			}

			// src/href values inside the body come in several shapes:
			//   1. upload/...   2. /upload/...   3. ./upload/<file>
			//   4. <file>       5. ./<file>
			// Shapes 1-3 are resolved against the site root.
			$src_pat = '/src="(\.?\/?upload.+?)"/';
			$href_pat = '/href="(\.?\/?upload.+?)"/';

			// Site root, using the scheme of the article link ('http' if absent).
			$urlParts = parse_url($content_url);
			$scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : 'http';
			$host = $scheme . '://' . (isset($urlParts['host']) ? $urlParts['host'] : '') . '/';

			// Directory of the article link, used for the remaining relative URLs.
			$host_name = dirname($content_url) . '/';

			// Drop inline styles, width attributes, <style> and <script> blocks.
			$content = preg_replace(array('/style=".+?"/i', '/width=".+?"/i', '/<style([\s\S]+?)<\/style>/i', '/<script([\s\S]+?)<\/script>/i'), '', $content);

			$content = preg_replace(
				array($src_pat, $href_pat),
				array('src="' . $host . '$1"', 'href="' . $host . '$1"'),
				$content
			);

			// Second pass: prefix every remaining relative URL with the article
			// directory. The lookahead skips values that already carry a scheme
			// (http:, https:, mailto:, ...), protocol-relative URLs and bare
			// fragments; [^"]+ keeps the match inside a single attribute.
			$src_pat2 = '/src="((?![a-z][a-z0-9+.\-]*:|\/\/|#)[^"]+)"/i';
			$href_pat2 = '/href="((?![a-z][a-z0-9+.\-]*:|\/\/|#)[^"]+)"/i';
			$content = preg_replace(
				array($src_pat2, $href_pat2),
				array('src="' . $host_name . '$1"', 'href="' . $host_name . '$1"'),
				$content
			);

			// Nothing usable was extracted: skip the article entirely.
			if (empty($content)) {
				continue;
			}

			// The first image of the body, if any, becomes the cover.
			$src_one = '/<img[^>]*src="([^>"]*)"/is';
			$cover_url = array();
			preg_match($src_one, $content, $cover_url);

			$old_data = $ArticleModel->getItem($id);
			// Checksum over title + body, used to detect changed articles.
			$data_check = sprintf("%u", crc32($title . $content));

			$params = array(
				"id" => $id,
				"link" => $content_url,
				"article_category" => $article_category,
				"title" => $title,
				"summary" => $summary,
				"content" => $content,
				"send_time" => $send_time,
				"from_site" => $from_site,
				"ctime" => $ctime,
				"utime" => $utime,
				"category_name" => $category_name,
				"category_color" => $category_color,
				"cover_url" => isset($cover_url[1]) ? $cover_url[1] : '',
				"data_check" => $data_check,
			);

			try {
				$ArticleModel->add($params);
				$i++;
			} catch (Exception $e) {
				// NOTE(review): a failing add() is taken to mean the row already
				// exists - confirm against ArticleModel. The row is rewritten
				// only when the stored checksum differs from the new one.
				$old_check = isset($old_data['data_check']) ? $old_data['data_check'] : null;
				if ((string) $old_check !== $data_check) {
					// $id is a decimal string produced by sprintf('%u'), so it
					// is safe to place in the condition.
					$ArticleModel->update($params, " id = {$id} ");
					$i++;
				}
			}

			$need = array(
				'title' => $title,
				'content' => htmlspecialchars_decode($content),
				'from_site' => $from_site,
				'send_time' => $send_time,
			);

			$data_test = array(
				'info' => $need,
			);

			// Render the article template into a buffer and store it as a static page.
			ob_start();
			$this->display("/article/infoContent.phtml", $data_test);
			$id_html = ob_get_clean();

			file_put_contents(PROJECT_ROOT . '/html/article/a' . $id . '.html', $id_html);

			// Pause 0.7 s between articles.
			usleep(700000);
		}

		libxml_use_internal_errors($previousLibxmlSetting);

		printf("本次更新了 %s 条数据", $i);
	}

  使用原生 PHP 生成静态缓存页,配合 crontab 定时刷新缓存,不需要第三方模板引擎。

php原生态生成静态缓存页,定时更新