首页 > 代码库 > re模块详解

re模块详解

 

  1 #!/usr/bin/env python  2 #-*- coding:UTF-8 -*-  3 #####################################################  4 # Author: sunfx   xingrhce@163.com  5 # Last modified:  2014/11/18  6 # Filename:  re.py  7 # Q  Q  群:  236147801  8 #####################################################  9   10 import re 11   12 #1.查找文本中的字符 13   14 pattern = this 15 text = Does this text match the pattern? 16   17 match = re.search(pattern,text) 18   19 s = match.start() 20 e = match.end() 21   22 print Found "%s"\nin "%s"\nfrom %d to %d ("%s")‘ % 23       (match.re.pattern,match.string,s,e,text[s:e]) 24   25 ‘‘‘ 26 match.re.pattern 要匹配的内容 27 match.string 匹配的字符 28 s  匹配到内容开始索引 29 d  匹配到内容结束索引 30 text[s:e] 匹配字符 31 ‘‘‘ 32   33 #2.编译表达式 34   35 regexes = [ re.compile(p) 36             for p in [this‘,that]               37 ] #把字符转换Regexobject格式 38   39   40   41 print Text: %r\n‘ % text #输出text内容 42   43 for regex in regexes: 44   45     print Seeking "%s"->‘ % regex.pattern,  #regex.pattern 要匹配的字符 46   47     if regex.search(text): #在text中搜索this or that 48   49         print match! 50   51     else: 52   53         print no match 54   55 #3.多重匹配 56   57 text = abbaaabbbbaaaaa 58   59 pattern = ab 60   61 for match in re.findall(pattern,text): 62   63     print Found: "%s"‘ % match 64   65 #findall 直接返回字符串 66   67   68 for match in re.finditer(pattern,text): 69     s = match.start() 70     e = match.end() 71     print Found "%s" at %d:%d‘ % (text[s:e],s,e) 72   73 #finditer 返回原输入文字在字符串的位置 74   75 #4.模式语法 76   77 def test_patterns(text,patterns=[]): 78   79     for pattern,desc in patterns:  80         print Pattern %r (%s) \n‘ %(pattern,desc)  81         print    %r‘ % text 82         for match in re.finditer(pattern,text): 83             s = match.start() 84             e = match.end() 85             substr = text[s:e] #匹配到的字符 86             n_backslashes = text[:s].count(\\‘) #查找文本:s坐标之前的包含多少\ 87             prefix = .‘ * ( s + n_backslashes )  88             print     %s%r‘ % (prefix,substr)  89         print 90     return 91   92 test_patterns(abbaaabbbbaaaaa, 93             [(ab‘,"‘a‘ followed by ‘b‘")] 94     ) 95   96 #贪婪模式 这种模式会减少单个匹配减少 97 ‘‘‘ 98      *                ‘匹配一次到多次‘ 99      +                ‘至少匹配一次到多次‘100      ?                ‘只匹配一次‘101      ab*,             ‘a followerd by zero or more b‘),  #匹配0次或者更多次102      ab+,             ‘a followerd by one or mrore b‘),  #最少匹配一次或者更多次103      ab?,             ‘a followerd by zero or one b‘),   #匹配0最多一次104      ab{3},           ‘a followerd by three b‘),         #最少匹配三次105      ab{2,3},           ‘a followerd by two to three b‘)   #匹配两至三次106  107  108      ab*?,             ‘a followerd by zero or more b‘),  #匹配0次或者更多次109      ab+?,             ‘a followerd by one or mrore b‘),  #最少匹配一次或者更多次110      ab??,             ‘a followerd by zero or one b‘),   #匹配0最多一次111      ab{3}?,           ‘a followerd by three b‘),         #最少匹配三次112      ab{2,3}?,           ‘a followerd by two to three b‘)   #匹配两至三次113 ‘‘‘114  115 #用法如下:116  117 str = absdsdsdsdsd118  119 print re.findall(ab*,str)120 #[‘ab‘]121  122 print re.findall(ab*?,str)123 #[‘a‘]124  125 #5.字符集126  127 ‘‘‘128 [ab]     ‘either a or b 匹配a或者b‘129 a[ab]+   ‘a followerd by 1 more a or b 匹配一次a、b或者多次 ‘130 a[ab]+?  ‘a followerd by 1 or more a or b,not greedy 匹配1一次可以匹配多次‘131 [^]      ‘不包含内容‘132 [a-z]    ‘所有小写ASCII字母‘ 133 [A-Z]    ‘所有大写写ASCII字母‘ 134 [a-zA-Z] ‘一个小写和大写的序列‘135 [A-Za-z] ‘一个大写小写的序列‘136 ‘‘‘137 str =aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba138  139 print re.findall([ab],str)140 print re.findall(a[ab]+,str)141 print re.findall(a[ab]+?,str)142 print re.findall([^_],str)143  144 str = China,lovE145  146 print re.findall([a-z][A-Z]‘,str)  #[‘vE‘] 147 print re.findall([A-Z][a-z]‘,str)  #[‘Ch‘]148  149 print re.findall([A-Z][a-z]+‘,str) #[‘China‘]150 print re.findall([a-z][A-Z]+‘,str) #[‘vE‘]151  152 print re.findall([A-Z][a-z]*‘,str) #[‘China‘, ‘E‘]153 print re.findall([a-z][A-Z]*‘,str) #[‘h‘, ‘i‘, ‘n‘, ‘a‘, ‘l‘, ‘o‘, ‘vE‘]154  155 print re.findall([A-Z][a-z]?‘,str) #[‘Ch‘, ‘E‘]156 print re.findall([a-z][A-Z]?‘,str) #[‘h‘, ‘i‘, ‘n‘, ‘a‘, ‘l‘, ‘o‘, ‘vE‘]157  158 ‘‘‘159 .      元字符匹配一个字符160 a.161 b.162 a.*b163 a.*?b164 ‘‘‘165  166 c = woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd167  168 print re.findall(a.‘,c)  #[‘ai‘, ‘aw‘, ‘as‘, ‘aa‘, ‘ab‘]169 print re.findall(b.‘,c)  #[‘b,‘, ‘bs‘, ‘ba‘, ‘bb‘, ‘bb‘, ‘bb‘, ‘bs‘]170 print re.findall(a.*b‘,c)  #[‘aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb‘] #贪婪模式匹配a到b之间的任意字符长度字符171 print re.findall(a.*?b‘,c)  #[‘aizhongguoawsb‘, ‘asssssssssssssdsdsdsdb‘, ‘aaab‘] # ?结束了* 的贪婪模式,172                              #它不会到最后一个b再去匹配而且见好就收,匹配可能最短的字符173  174  175 #6.转义码176  177 ‘‘‘178 转义码                                   含义179  \d                                    一个数字180  \D                                    一个非字符181  \s                                    空白符(制表符、空格、换行符)182  \S                                    非空白符(符号、字母、数字)183  \w                                    字母数字184  \W                                    非字母数字(符号、制表符、空格、换行符)185 ‘‘‘186  187 #7.锚定188  189 ‘‘‘190 锚定码                               含义191   ^                              字符串或行的开始192   $                              字符串或行结束193   \A                             字符串开始194   \Z                             字符串结束195   \b                             一个单词开头或者末尾的空串196   \B                             不在一个单词的开头活末尾的空串197 ‘‘‘198 #8.限制搜索 match、search199  200 text = This is some text --with punctuation.201  202 pattern = is203  204 print Text    :,text205 print pattern:,pattern206  207 m = re.match(pattern,text)   #因为match是从字符开头开始匹配 is没有在开头所以没有匹配到.208 print Match :,m   209  210 s = re.search(pattern,text) #is在文本中出现了两次所以匹配到内容211 print Search :,s212  213 pattern = re.compile(r\b\w*is\w*\b‘) #编译规则214  215 print Text:,text216  217  218 pos = 0219 while  True:220     match = pattern.search(text,pos) #搜索规则221     if not match:222         break223     s = match.start()224     e = match.end() 225     print   %d : %d = "%s"‘ % (s,e-1,text[s:e]) 226     pos = e227  228 #9 用户组解析匹配(任何一个正则都可以为组并嵌套在一个更大的表达式中)229 regex = re.compile(r(\bt\w+)\W+(\w+))230  231 print Input  text      :,text232  233 print Pattern          :,regex.pattern234  235 match = regex.search(text)236 print Entire match     :‘,match.group(0) #表示整个表达式的字符串,子组从1开始排序237 print World start with "t":‘,match.group(1) #匹配到的第一组238 print World after "t" word :‘,match.group(2) #匹配到的第二组239  240 #python对基本分组进行了扩展 (?P<name>pattern)241  242 print text243 print244 for pattern in [ r^(?P<first_word>\w+)‘,  #组名和正则表达式组成245                  r(?P<last_word>\w+)\S*$,246                  r(?P<t_word>\bt\w+)\W+(?P<other_word>\w+),247                  r(?P<ends_with_t>\w+t)\b,248                  ]:249     regex = re.compile(pattern)250     match = regex.search(text)251     print Matching "%s"‘ % pattern252     print ‘,match.groups()  #匹配到所有的组的值253     print ‘,match.groupdict() #把组名和字串生成字典 254     print255  256 def test_patterns(text,patterns=[]):257     ‘‘‘Given source text and a list of patterns,look for 258     matches for each pattern within the text and print259     them to stdout.260     ‘‘‘261     #look for each pattern in the text and print the resuls262  263     for pattern,desc in patterns:264         print Pattern %r (%s)\n‘ % (pattern,desc)265         print    %r‘ % text266     for match in re.finditer(pattern,text):267         s = match.start()268         e = match.end()269         prefix = ‘ * (s) #‘空格 X 次数‘270         print    %s%r%s‘ % (prefix,text[s:e],‘*(len(text)-e)),271         print match.groups()272         if match.groupdict():273             print %s%s‘ % (‘ * (len(text) -s),match,groupdict())274             print275     return276  277 print test_patterns(text,[(r(a(a*)(b*))‘,a followerd by 0-n a and 0-n b)])278  279 ‘‘‘280 |       代表左右表达式任意匹配一个,他总是先尝试匹配左边的表达式,一旦成功匹配则281 跳过匹配右边的表达式。如果|没有被包括()中,则它的范围是整个正则表达式282 ?:pattern283 ‘‘‘284  285  286 #10.搜索选项 - 不区分大小写的匹配287 ‘‘‘288 re.IGNORECASE 忽略大小写289 ‘‘‘290  291 text  = This is some text  -- with punctuation.292 pattern = r\bT\w+293 with_case = re.compile(pattern)294 whitout_case = re.compile(pattern,re.IGNORECASE) #re.IGNORECASE 忽略大小写295  296 print Text: \n  %r‘ % text297 print Pattern:\n %s‘ % pattern298 print Case-sensitive:299 for match in with_case.findall(text):300     print   %r‘ % match301 print Case-insensitive:302 for match in whitout_case.findall(text):303     print  %r‘ % match304  305 #11.多行输入306 ‘‘‘307 MULTILINE  多行匹配308 ‘‘‘309  310 text = This is some text  -- with punctuation.\nA secone lines.311 pattern = r(^\w+)|(\w+\S*$)312 single_line = re.compile(pattern)313 multiline = re.compile(pattern,re.MULTILINE) 314 print Text:\n %r‘ % text315 print Pattern:\n  %s‘ % pattern316 print Single Line :317 for match in single_line.findall(text):318     print   %r‘ % (match,)319 print MULTILINE  :320 for match in multiline.findall(text):321     print   %r‘  % (match,)322  323 ‘‘‘324 DOTALL 让点字符也可以匹配换行符325 ‘‘‘326  327 pattern = r.+328 no_newlines = re.compile(pattern)329 dotall = re.compile(pattern,re.DOTALL)330  331 print Text :\n   %r‘ % text332 print Pattern:\n %s‘ % pattern333 print No newlines :334 for match in no_newlines.findall(text):335     print   %r‘ % match336 print Dotall    :337 for  match in dotall.findall(text):338     print   %r‘ % match339  340 #12 Unicode匹配341 ‘‘‘342 re.UNICODE 匹配Unicode343 ‘‘‘344  345  346 import codecs347 import sys348  349 #set standard output encoding to UTF-8350  351 sys.output = codecs.getwriter(UTF-8)(sys.stdout)352  353 pattern = ur\w+354 ascii_pattern = re.compile(pattern)355 unicde_pattern = re.compile(pattern,re.UNICODE)356  357 print Text    :,text358 print Pattern :,pattern359 print ASCII   :‘,u, .join(ascii_pattern.findall(text))360 print Unicode :‘,u, .join(unicde_pattern.findall(text))361  362 ‘‘‘363 re.VERBOSE 让正则更容易读364 ‘‘‘365  366 address = re.compile(367         ‘‘‘368         [\w\d.+-]+    #username369         @ 370         ([\w\d.]+\.)+ #domain name prefix371         (com|org|edu) #TODO:support more top-level domains372         ‘‘‘,373         re.UNICODE | re.VERBOSE)374  375 candidates = [376         ufirst.last@example.com,377         ufirst.last+category@gmail.com,378         uvalid-address@mail.example.com,379         unot-valid@example.foo380 ]381  382 for candidate in candidates:383     match = address.search(candidate)384     print %-30s %s‘ % (candidate,Matcheif match else no match)385  386  387 address = re.compile (388     ‘‘‘389     #A name is made up of letters,and may include "."390     #for title abbreviations and middle initials.391     ((?P<name>392         ([\w.,]+\S+)*[\w.,]+)393         \s*394         # Email addresses are wrapped in angle395         # brackets: <> but only if a name is 396         # found, so keep the start bracket in this397         # group.398         <399     )?  # the entire name is optional400      401     # the address itself:username@domain.tld402     (?P<email>403         [\w\d.+-]+    #username404         @ 405         ([\w\d.]+\.)+ #domain name prefix406         (com|org|edu) #TODO:support more top-level domains407     )408     >? # optional closeing angle break409     ‘‘‘,410     re.UNICODE | re.VERBOSE)411  412 candidates = [413         ufirst.last@example.com,414         ufirst.last+category@gmail.com,415         uvalid-address@mail.example.com,416         unot-valid@example.foo417         uFist Last <first.last@example.com>418         uNO Brackets first.last@example,419         uFirst Last,420         uFirst Middle Last <first.last@example.com>,421         uFirst M. Last <first.last@example.com>,422         u<first.last@example.com>,423 ]424  425 for candidate in candidates:426     print candidate:,candidate427     match = address.search(candidate)428     if match:429         print  Name:‘,match.groupdict()[name]430         print  Email:‘,match.groupdict()[email]431     else:432         print    No match433  434 ‘‘‘435                     正则表达式标志缩写表436  437     标志                  缩写               描述438  439   IGNORECASE              i           忽略大小写440   MULTILINE                 m           多行匹配441   DOTALL                    s          让点字符也可以匹配换行符442   UNICODE                  u          匹配Unicode443   VERBOSE                 x          让正则更容易读444 在模式中嵌入标签(?imu)会打开相应的选项445 ‘‘‘446 text = This is  some text -- with punctuation.447 pattern = r(?i)\bT\w+448 regex = re.compile(pattern)449  450 print Text   :,text451 print Pattern    :,pattern452 print Matches   :,regex.findall(text)453  454 #13 前向或后向455  456 address = re.compile(457     ‘‘‘458     # A name is made up of letters, and may include "."459     # for title abbreviations and middle initials460     ((?P<name>461         ([\w.,]+\s+)*[\w.,]+462         )463     \s+464     )  # name is no longer optional465     # LOOKAHEAD466     # Email address are wrapped in angle brackets, but only467     # if they are both present or neither is .468     (?= (<.*>$)469         |470         ([^<].*[^>]$)471     )472     <? # optional opening angle bracket473  474     # The address itself: username@domain.tld475     (?P<email>476         [\w\d.+-]+477         @478         ([\w\d.]+\.)+479         (com|org|edu)480     )481     >?482     ‘‘‘,483     re.UNICODE | re.VERBOSE)484  485 candidates = [486     uFirst Last <first.last@example.com>,487     uNo Brackets first.last@example.com,488     uOpen Brackets <first.last@example.com>,489     uClose Brackets first.last@example.com,490     ]491 for candidate in candidates:492     print Candidate:,candidate493     match = address.search(candidate)494     if match:495         print  Name :‘,match.groupdict()[name]496         print  Email :‘,match.groupdict()[email]497     else:498         print   No match499  500 #自动忽略系统常用的noreply邮件地址501 ‘‘‘502 (?!noreply@.*$) 忽略这个邮件地址503 (?<!noreply>)  两种模式 写在username之前不会向后断言 504 (?<=pattern)   用肯定向后断言查找符合某个模式的文本 505 ‘‘‘506 address = re.compile(507     ‘‘‘508     ^509     # An address: username@domain.tld510  511     # Ignore noreply address512     (?!noreply@.*$)513  514     [\w\d.+-]+     # username515     @516     ([\w\d.]+\.)+  # domain name prefix517     (com|org|edu)  # limit the allowed top-level domains518  519     $520     ‘‘‘,521     re.UNICODE | re.VERBOSE)522  523 candidates = [524  525     ufirst.last@example.com,526     unoreply@example.com,527 ]528  529 for candidate in candidates:530     print Candidate:,candidate531     match = address.search(candidate)532     if match:533         print   Match:,candidate[match.start():match.end()]534     else:535         print   No match536  537 twitter = re.compile(538     ‘‘‘539     # A twitter handle: @username540     (?<=@)541     ([\w\d_]+)   # username542     ‘‘‘,543     re.UNICODE | re.VERBOSE)544  545 text = ‘‘‘ This text includes two Twitter handles.546 One for @TheSF,and one for the author,@doughellmann.547 ‘‘‘548 print text549 for match in twitter.findall(text):550     print handle:,match551  552 #14 自引用表达式 #可以把表达式编号后面来引用553  554 address = re.compile(555     ‘‘‘556     (\w+)          # first name557     \s+558     (([\w.]+)\s+)?  # optional middle name or initial559     (\w+)           # last name560  561     \s+562     <563  564     # The address: first_name.last_name@domain.tld565     (?P<email>566         \1         #first name567         \.568         \4         #last name569         @570         ([\w\d.]+\.)+571         (com|org|edu)572         )            573     >574     ‘‘‘,575     re.UNICODE | re.VERBOSE | re.IGNORECASE)576  577 candidates = [578     uFirst Last <first.last@example.com>,579     uDifferent Name <first.last.example.com>,580     uFirst Middle Last <first.last@example.com>, 581 ]582 for candidate in candidates:583     print Candidate:,candidate584     match = address.search(candidate)585 if match:586     print   Match name:‘,match.group(1),match.group(4)587 else:588     print  No match589  590 #正则表达式解析包括一个扩展,可以使用(?P=name)指示表达式先前匹配的一个命名组的值.591  592 address = re.compile(593     ‘‘‘594  595     # The regular name596     (?P<first_name>\w+)597     \s+598     (([\w.]+)\s+)?599     (?P<last_name>\w+)600     \s+601     <602  603     # The address: first_name.last_name@domain.tld604     (?P<email>605         (?P=first_name)606         \.607         (?P=last_name)608         @609         ([\w\d.]+\.)+610         (com|org|edu)611         )612     >613     ‘‘‘,614     re.UNICODE | re.VERBOSE | re.IGNORECASE)615  616 candidates = [617     uFirst last <first.last@example.com>,618     uDifferent Name <first.last@example.com>,619     uFirst Middle last <first.last@example.com>,620     uFirst M. Last<first.last@example.com>,621 ]622  623 for candidate in candidates:624     print Candidate:,candidate625     match = address.search(candidate)626     if match:627         print   Match name:‘,match.groupdict()[first_name]628         print match.groupdict()[last_name]629         print   Match email:‘,match.groupdict()[email]630  631     else:632         print No match633  634 #15 用模式修改字符串635 ‘‘‘636 re支持使用正则表达式作为搜索机制来修改文本,而且可以替换可以引用正则表达式中的匹配组作为替换文本的一部分。637 ‘‘‘638 bold = re.compile(r\*{2}(.*?)\*{2})639 text = Make this **bold**. This **too**.640 print Text:,text641 print Bold:‘,bold.sub(r<b>\1</b>,text)642  643 ‘‘‘644 使用命名组来替换645 count 来限制替换次数646 sbun 工作原理和sub相似 subn同时返回修改后的字符串和完成的替换次数647 ‘‘‘648  649 bold = re.compile(r\*{2}(?P<bold_text>.*?)\*{2},re.UNICODE,)650  651 print Text:,text652 print Bold:‘,bold.sub(r<b>\g<bold_text></b>‘,text,count=1)653  654 #16 利用模式拆分655  656 ‘‘‘657 str.split() 是分解字符串来完成解析的最常用方法之一,它只是支持字面值得作为分隔符658 ‘‘‘659  660 text = ‘‘‘Paragraph one661 one tuo lines.662  663 Paragraph two.664  665 Paragraph three.‘‘‘666  667 print With findall:668 for num,para in enumerate(re.findall(r.+?\n{2,}|$,669                                     text,670                                     flags = re.DOTALL)671                             ):672     print num,repr(para)673     print674  675 print 676 print With split:677 for num,para in enumerate(re.split(r\n{2,},text)):678     print num,repr(para)679     print

 

re模块详解