8.7. Syntax Group¶
Capture the results of sub-expressions
Can be named or positional
Note that for backreferences you must use a raw string or a double backslash
8.7.1. Syntax¶
()
- matches whatever regular expression is inside the parentheses, and indicates the start and end of a group
(...)
- unnamed group
(?P<mygroup>...)
- named group mygroup
(?:...)
- non-capturing group
(?#...)
- comment
8.7.2. Positional Group¶
(...)
- unnamed (positional) group
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> re.findall(r'\dth', TEXT)
['7th']
>>>
>>> re.findall(r'(\d)th', TEXT)
['7']
>>>
>>> re.findall(r'\d(th)', TEXT)
['th']
>>> re.findall('\d:\d\d', TEXT)
['1:37']
>>> re.findall('(\d):\d\d', TEXT)
['1']
>>> re.findall('\d:(\d\d)', TEXT)
['37']
>>> re.findall('(\d):(\d\d)', TEXT)
[('1', '37')]
>>> re.findall(r'([A-Z][a-z]+\s[A-Z][a-z]+)', TEXT)
['Mark Watney']
>>>
>>> re.findall(r'([A-Z][a-z]+) ([A-Z][a-z]+)', TEXT)
[('Mark', 'Watney')]
>>>
>>> re.findall(r'([A-Z][a-z]+) ([A-Z][a-z]+)', TEXT)[0]
('Mark', 'Watney')
>>> firstname = r'([A-Z][a-z]+)'
>>> lastname = r'([A-Z][a-z]+)'
>>>
>>> re.findall(f'{firstname} {lastname}', TEXT)[0]
('Mark', 'Watney')
>>> firstname = r'([A-Z][a-z]+)'
>>> lastname = r'([A-Z][a-z]+)'
>>> name = f'{firstname} {lastname}'
>>>
>>> re.findall(name, TEXT)[0]
('Mark', 'Watney')
>>> firstname = r'[A-Z][a-z]+'
>>> lastname = r'[A-Z][a-z]+'
>>> name = f'({firstname}) ({lastname})'
>>>
>>> re.findall(name, TEXT)[0]
('Mark', 'Watney')
8.7.3. Named Group¶
(?P<mygroup>...)
- named group mygroup
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> firstname = r'[A-Z][a-z]+'
>>> lastname = r'[A-Z][a-z]+'
>>> name = f'(?P<firstname>{firstname}) (?P<lastname>{lastname})'
>>>
>>> re.findall(name, TEXT)
[('Mark', 'Watney')]
>>>
>>> re.search(name, TEXT)
<re.Match object; span=(0, 11), match='Mark Watney'>
>>>
>>> re.search(name, TEXT).groups()
('Mark', 'Watney')
>>>
>>> re.search(name, TEXT).groupdict()
{'firstname': 'Mark', 'lastname': 'Watney'}
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> time = '(?P<hour>\d{1,2}):(?P<minute>\d{1,2})'
>>>
>>> re.findall(time, TEXT)
[('1', '37')]
>>>
>>> re.search(time, TEXT).groups()
('1', '37')
>>>
>>> re.search(time, TEXT).group(0)
'1:37'
>>>
>>> re.search(time, TEXT).group(1)
'1'
>>>
>>> re.search(time, TEXT).group(2)
'37'
>>>
>>> re.search(time, TEXT).groupdict()
{'hour': '1', 'minute': '37'}
8.7.4. Non-Capturing Group¶
(?:...)
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> re.findall('\w{3} \d{1,2}th, \d{4}', TEXT)
['Nov 7th, 2035']
>>>
>>> re.findall('\w{3} \d{1,2}st|nd|rd|th, \d{4}', TEXT)
['nd', 'th, 2035']
>>>
>>> re.findall('\w{3} \d{1,2}(st|nd|rd|th), \d{4}', TEXT)
['th']
>>>
>>> re.findall('\w{3} \d{1,2}(?:st|nd|rd|th), \d{4}', TEXT)
['Nov 7th, 2035']
>>>
>>> re.findall('(\w{3}) (\d{1,2})(?:st|nd|rd|th), (\d{4})', TEXT)
[('Nov', '7', '2035')]
>>>
>>> re.findall('(\w{3}) (\d{1,2})(st|nd|rd|th), (\d{4})', TEXT)
[('Nov', '7', 'th', '2035')]
>>> date = r'(\w{3} \d{1,2}(?:st|nd|rd|th), \d{4})'
>>> re.findall(date, TEXT)
['Nov 7th, 2035']
>>> year = '\d{4}'
>>> month = '\w{3}'
>>> day = '\d{1,2}'
>>>
>>> re.findall(f'{month} {day}(st|nd|rd|th), {year}', TEXT)
['th']
>>>
>>> re.findall(f'{month} {day}(?:st|nd|rd|th), {year}', TEXT)
['Nov 7th, 2035']
8.7.5. Comment¶
(?#...)
- comment
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> re.findall(r'\d{4}(?#year)', TEXT)
['2035']
>>>
>>> re.findall('\d{1,2}(?#hour):\d{2}(?#minute)', TEXT)
['1:37']
>>> hour = '\d{1,2}(?#hour)'
>>> minute = '\d{2}(?#minute)'
>>> time = f'{hour}:{minute}'
>>>
>>> re.findall(time, TEXT)
['1:37']
>>>
>>> time
'\\d{1,2}(?#hour):\\d{2}(?#minute)'
8.7.6. Backreference¶
\g<number>
- backreferencing by group number (in a replacement string)
\g<name>
- backreferencing by group name (in a replacement string)
(?P=name)
- backreferencing by group name (inside a pattern)
\number
- backreferencing by group number
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> year = '(?P<year>\d{4})'
>>> month = '(?P<month>\w+)'
>>> day = '(?P<day>\d{1,2})'
>>>
>>> re.sub(f'{month} {day}th, {year}', '\g<day> \g<month> \g<year>', TEXT)
'Mark Watney of Ares 3 landed on Mars on: 7 Nov 2035 at 1:37 pm'
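Note that the (?P=name) syntax is valid only inside a pattern; in the replacement string of re.sub it is treated as literal text, so the following does not substitute the groups: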
>>> re.sub(f'{month} {day}th, {year}', '(?P=day) (?P=month) (?P=year)', TEXT)
'Mark Watney of Ares 3 landed on Mars on: (?P=day) (?P=month) (?P=year) at 1:37 pm'
8.7.7. Examples¶
(\w+)
- word characters (including unicode chars, numbers and underscores)
\d+(\.\d+)?
- float with optional decimals
\d+(,\d+)?
- number with a comma (,) as thousands separator
(?P<word>\w+)
- named group word with \w+, i.e. at least one word character (including unicode chars, numbers and underscores)
(?P<tag><.*?>).+(?P=tag)
- matches text inside of a <tag> (opening and closing tag is the same)
(.+) \1
- matches "the the" or "55 55"
(.+) \1
- does not match "thethe" (note the space after the group)
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> re.findall(r'\d{,2}(st|nd|rd|th)?', TEXT)
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', 'nd', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', '', '', 'th', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '']
>>>
>>> re.findall(r'\d{1,2}(st|nd|rd|th)?', TEXT)
['', 'th', '', '', '', '']
>>>
>>> re.findall(r'\d{1,2}(st|nd|rd|th)+?', TEXT)
['th']
>>>
>>> re.findall(r'\d{1,2}st|nd|rd|th+?', TEXT) # nd is also in word `landed`
['nd', 'th']
>>>
>>> re.findall(r'\d{1,2}(?:st|nd|rd|th)+?', TEXT)
['7th']
>>>
>>> re.findall(r'(\d{1,2})(st|nd|rd|th)+?', TEXT)
[('7', 'th')]
>>>
>>> re.findall(r'(\d{1,2})(?:st|nd|rd|th)+?', TEXT)
['7']
>>>
>>> re.findall(r'(\w{3}) (\d{1,2})(?:st|nd|rd|th)+?, (\d{4})', TEXT)
[('Nov', '7', '2035')]
>>>
>>> re.findall(r'(\w{3}) (\d{1,2})(?:st|nd|rd|th)+?, (\d{4})', TEXT)[0]
('Nov', '7', '2035')
>>>
>>> re.findall(r'(\w{3} \d{1,2}(?:st|nd|rd|th)+?, \d{4})', TEXT)
['Nov 7th, 2035']
8.7.8. Use Case - 0x01¶
Dates
>>> import re
>>> TEXT = 'Mark Watney of Ares 3 landed on Mars on: Nov 7th, 2035 at 1:37 pm'
>>> year = r'(?P<year>\d{4})'
>>> month = r'(?P<month>\w{3})'
>>> day = r'(?P<day>\d{1,2}(?:st|nd|rd|th)+?)'
>>> date = f'{month} {day}, {year}'
>>>
>>> re.search(date, TEXT).groupdict()
{'month': 'Nov', 'day': '7th', 'year': '2035'}
8.7.9. Use Case - 0x02¶
>>> import re
>>> line = 'value=123'
>>>
>>> re.findall(r'(\w+)\s?=\s?(\d+)', line)
[('value', '123')]
>>> line = 'value = 123'
>>>
>>> re.findall(r'(\w+)\s?=\s?(\d+)', line)
[('value', '123')]
8.7.10. Use Case - 0x03¶
>>> import re
>>>
>>>
>>> variable = '(?P<variable>\w+)'
>>> space = '\s?' # optional space
>>> value = '(?P<value>.+)'
>>> assignment = f'^{variable}{space}={space}{value}$'
>>>
>>> line_of_code = 'myvar = 123'
>>> re.findall(assignment, line_of_code)
[('myvar', '123')]
8.7.11. Use Case - 0x04¶
>>> import re
>>>
>>>
>>> variable = '(?P<variable>\w+)'
>>> space = '\s?(?#optional space)'
>>> value = '(?P<value>.+)'
>>> assignment = f'^{variable}{space}={space}{value}$'
>>>
>>> assignment
'^(?P<variable>\\w+)\\s?(?#optional space)=\\s?(?#optional space)(?P<value>.+)$'
8.7.12. Use Case - 0x05¶
>>> import re
>>>
>>>
>>> HTML = '<p>Hello World</p>'
>>>
>>> search = '<p>(.+)</p>'
>>> replace = '<strong>\g<1></strong>'
>>>
>>> re.sub(search, replace, HTML)
'<strong>Hello World</strong>'
8.7.13. Use Case - 0x06¶
>>> import re
>>>
>>>
>>> HTML = '<p>Hello World</p>'
>>>
>>> search = '<p>(?P<text>.+)</p>'
>>> replace = '<strong>\g<text></strong>'
>>>
>>> re.sub(search, replace, HTML)
'<strong>Hello World</strong>'
8.7.14. Use Case - 0x07¶
>>> import re
>>>
>>>
>>> HTML = '<p>Hello World</p>'
>>> tag = re.findall('<(?P<tag>.+)>(?:.+)</(?P=tag)>', HTML)
>>>
>>> tag
['p']
8.7.15. Use Case - 0x08¶
>>> import re
>>>
>>>
>>> HTML = '<p>Hello World</p>'
>>>
>>> re.findall('<(?P<tag>.*?)>(.*?)</(?P=tag)>', HTML)
[('p', 'Hello World')]