首页 > 代码库 > Markup添加标记

Markup添加标记

本文内容来自《Python基础教程(第2版)》中的项目。

Markup要做的就是为纯文本添加一些格式。以一个文本文件作为输入,然后在浏览器中查看输出的结果,或者直接检查新增的标签。

首先,我们要做的是将文本分块,具体是不断地读入行直到遇到空行,再将前面的所有行加在一起算一个块。

util.py 包含两个工具性的函数。第一个函数lines只是在文件的末尾加了一个空行,为了能让blocks函数找到最后一个块的结束标志。

def lines(file):
    """Yield every line of *file*, then one extra blank line.

    The trailing newline is a sentinel: blocks() only emits a block when
    it meets a blank line, so without it the last block of a file that
    does not end in a blank line would never be flushed.

    :param file: any iterable of text lines (an open file, a list, ...).
    """
    for line in file:
        yield line
    yield '\n'


def blocks(file):
    """Split *file* into blocks of consecutive non-blank lines.

    Lines are accumulated until a blank (whitespace-only) line is seen;
    the accumulated lines are then joined, stripped of surrounding
    whitespace and yielded as one string. Runs of several blank lines
    produce no empty blocks.

    :param file: any iterable of text lines (an open file, a list, ...).
    """
    block = []
    for line in lines(file):
        if line.strip():
            block.append(line)
        elif block:
            # Blank line after some content: the current block is complete.
            yield ''.join(block).strip()
            block = []

后面的handler、rule和主程序markup我不太好解释,可能是自己理解上还不到位。

handlers.py用来处理具体的标记添加和文本替换。

class Handler:
    """
    An object that handles method calls from the Parser.

    The Parser will call the start() and end() methods at the
    beginning and end of each block, with the proper block name as a
    parameter. The sub() method will be used in regular expression
    substitution. When called with a name such as 'emphasis', it will
    return a proper substitution function.
    """

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` if it exists.

        Returns the method's result, or None when this handler defines no
        such method (unknown names are silently ignored).
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Dispatch to ``start_<name>()``, if defined."""
        self.callback('start_', name)

    def end(self, name):
        """Dispatch to ``end_<name>()``, if defined."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function for re.sub() bound to ``sub_<name>``.

        The returned function takes a match object and returns the
        replacement text. When the handler has no ``sub_<name>`` method the
        matched text is returned unchanged, so the filter is a no-op
        instead of deleting the match.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # No handler for this filter: leave the text as it was.
                result = match.group(0)
            return result
        return substitution


class HTMLRenderer(Handler):
    """
    A specific handler used for rendering HTML.

    The methods in HTMLRenderer are accessed from the superclass
    Handler's start(), end(), and sub() methods. They implement basic
    markup as used in HTML documents. All output goes to standard
    output; the single-argument print(...) form used here behaves the
    same on Python 2 and Python 3.
    """

    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Wrap the first captured group in <em> tags."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Turn the captured URL into a link pointing at that same URL."""
        # The captured group already contains the scheme and host, so it is
        # used verbatim as the href.
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        """Turn the captured e-mail address into a mailto: link."""
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        """Emit the (already filtered) text of a block."""
        print(data)

rules.py 用来识别文本块中的标题,段落,列表等格式。

class Rule:
    """
    Base class for all rules.

    Subclasses provide a ``type`` attribute (the block name passed to the
    handler) and a ``condition(block)`` method. ``action()`` returns True
    when the block has been fully handled and no further rule should be
    tried for it.
    """

    def action(self, block, handler):
        """Emit *block* wrapped in this rule's start/end markup."""
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """
    A heading is a single line that is at most 70 characters and
    that doesn't end with a colon.
    """
    type = 'heading'

    def condition(self, block):
        # endswith() rather than block[-1] so an empty block cannot raise.
        return ('\n' not in block
                and len(block) <= 70
                and not block.endswith(':'))


class TitleRule(HeadingRule):
    """
    The title is the first block in the document, provided that it is
    a heading.
    """
    type = 'title'
    first = True  # True until condition() has been consulted once

    def condition(self, block):
        if not self.first:
            return False
        # Only the very first block examined may be the title, whether or
        # not it turns out to qualify as a heading.
        self.first = False
        return HeadingRule.condition(self, block)


class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen. As part of
    the formatting, the hyphen is removed.
    """
    type = 'listitem'

    def condition(self, block):
        # startswith() rather than block[0] so an empty block cannot raise.
        return block.startswith('-')

    def action(self, block, handler):
        """Emit *block* as a list item, without its leading hyphen."""
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


class ListRule(ListItemRule):
    """
    A list begins between a block that is not a list item and a
    subsequent list item. It ends after the last consecutive list
    item.
    """
    type = 'list'
    inside = False  # True while consecutive list items are being processed

    def condition(self, block):
        # Always consulted, so that transitions into and out of a list are
        # detected on every block.
        return True

    def action(self, block, handler):
        """Open or close the list as needed; never consumes the block."""
        is_item = ListItemRule.condition(self, block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        # False lets the remaining rules handle the block itself.
        return False


class ParagraphRule(Rule):
    """
    A paragraph is simply a block that isn't covered by any of the
    other rules.
    """
    type = 'paragraph'

    def condition(self, block):
        return True

markup主程序。一个简单的文本分析器。

import re
import sys

from handlers import HTMLRenderer
from rules import HeadingRule, ListItemRule, ListRule, ParagraphRule, TitleRule
from util import blocks


class Parser:
    """
    A Parser reads a text file, applying rules and controlling a handler.

    Filters are regular-expression substitutions applied to every block;
    rules decide which markup the handler emits for the block.
    """

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule*; rules are tried in the order they were added."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a substitution of *pattern* handled by ``sub_<name>``.

        :param pattern: regular expression source string.
        :param name: filter name passed to ``handler.sub()``.
        """
        compiled = re.compile(pattern)  # compile once, reuse for every block

        def apply_filter(block, handler):
            return compiled.sub(handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """Read blocks from *file* and drive the handler for each of them.

        Every block is first passed through all filters, then offered to
        the rules in order. A rule whose action() returns a true value
        ends rule processing for that block.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block, self.handler)
                    if last:
                        break
        self.handler.end('document')


class BasicTextParser(Parser):
    """
    A specific Parser that adds rules and filters in its
    constructor.
    """

    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Order matters: ListRule only opens or closes the list and then
        # lets the later rules render the block itself.
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())

        self.addFilter(r'\*(.+?)\*', 'emphasis')
        self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


def main():
    """Render the text read from standard input as HTML on standard output."""
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)


if __name__ == '__main__':
    main()

  

 

Markup添加标记