Python3 – 正規表現 – モジュール定数

ここでは正規表現の操作に使われるモジュール定数を整理する。

re.A/re.ASCII

\b、\B、\d、\D、\s、\S、\w、\Wにおいて、ASCII文字のみでマッチングを行う。Unicodeパターンでのみ意味があり、バイト列パターンでは無視される。

import re

# One pattern compiled twice: Unicode-aware (the default for str patterns)
# and restricted to ASCII via re.ASCII.
ptn = r'\b\w+\b'
regex_obj = re.compile(ptn)
regex_obj_ascii = re.compile(ptn, re.ASCII)

# The same text, first without and then with a full-width space before
# "Python".
for s in ("プログラミング言語Python", "プログラミング言語　Python"):
    print(regex_obj.findall(s))
    print(regex_obj_ascii.findall(s))

# Output:
# ['プログラミング言語Python']
# ['Python']
# ['プログラミング言語', 'Python']
# ['Python']

import re

ptn = r'\b\w+\b'
regex_obj = re.compile(ptn)                        # Unicode matching (default)
regex_obj_ascii = re.compile(ptn, flags=re.ASCII)  # ASCII-only \b and \w

# No separator: in Unicode mode the whole string is a single word.
s = "プログラミング言語Python"
print(regex_obj.findall(s), regex_obj_ascii.findall(s), sep="\n")
# ['プログラミング言語Python']
# ['Python']

# A space between the two parts splits the Unicode-mode match in two.
s = "プログラミング言語　Python"
print(regex_obj.findall(s), regex_obj_ascii.findall(s), sep="\n")
# ['プログラミング言語', 'Python']
# ['Python']

re.I/re.IGNORECASE

英大文字・小文字を区別せずにマッチングを行う。[A-Z]のような表現で小文字ともマッチする。現在のロケールに影響を受けず、Unicode文字に対しても動作する。

import re

# Upper-case-only words; the second pattern also accepts full-width A-Z.
ascii_upper = r'\b[A-Z]+\b'
wide_upper = r'\b[A-ZＡ-Ｚ]+\b'

# Each string is searched twice: with no flags, then with re.IGNORECASE.
s = "abc DEF"
for flags in (0, re.IGNORECASE):
    print(re.findall(ascii_upper, s, flags=flags))
# ['DEF']
# ['abc', 'DEF']

s = "abc DEF ｇｅｈ ＩＪＫ"
for flags in (0, re.IGNORECASE):
    print(re.findall(wide_upper, s, flags=flags))
# ['DEF', 'ＩＪＫ']
# ['abc', 'DEF', 'ｇｅｈ', 'ＩＪＫ']

import re

s = "abc DEF"
print(re.findall(r'\b[A-Z]+\b', s))
# ['DEF']
print(re.findall(r'\b[A-Z]+\b', s, flags=re.IGNORECASE))
# ['abc', 'DEF']

# The two full-width words must be separated by a space. Without it they
# form a single word, \b cannot match between them, and the results below
# would no longer hold.
s = "abc DEF ｇｅｈ ＩＪＫ"
print(re.findall(r'\b[A-ZＡ-Ｚ]+\b', s))
# ['DEF', 'ＩＪＫ']
print(re.findall(r'\b[A-ZＡ-Ｚ]+\b', s, flags=re.IGNORECASE))
# ['abc', 'DEF', 'ｇｅｈ', 'ＩＪＫ']

re.L/re.LOCALE

\b、\B、\s、\S、\w、\Wにおいて、現在のロケールに従ったマッチングを行う。バイト列でのみ意味を持つ。非推奨。

re.M/re.MULTILINE

デフォルトでは'^'は文字列全体の先頭に、'$'は文字列全体の末尾にのみマッチするが、このフラグにより、'^'は文字列の先頭と各行の先頭(各改行の直後)、'$'は文字列の末尾と各行の末尾(各改行の直前)とマッチする。

import re

# Two-line sample text.
s = ("Love the life you live.\n"
     "Live the life you love.")

starts_with_l = r'^L\w*'   # a word beginning with "L" at '^'
ends_with_e = r'\w*e.$'    # a word ending in "e" plus one character at '$'

# First pass uses the default flags, second pass uses re.MULTILINE.
for flags in (0, re.MULTILINE):
    print(re.findall(starts_with_l, s, flags=flags))
    print(re.findall(ends_with_e, s, flags=flags))

# Output:
# ['Love']
# ['love.']
# ['Love', 'Live']
# ['live.', 'love.']

import re

# Two-line sample text. No blank line may appear inside the literal,
# otherwise the string would contain an extra empty line.
s = """Love the life you live.
Live the life you love."""

# Default: '^' and '$' anchor only at the start and end of the whole string.
print(re.findall(r'^L\w*', s))
# ['Love']
print(re.findall(r'\w*e.$', s))
# ['love.']

# re.MULTILINE: '^' and '$' also anchor at the start and end of each line.
print(re.findall(r'^L\w*', s, flags=re.MULTILINE))
# ['Love', 'Live']
print(re.findall(r'\w*e.$', s, flags=re.MULTILINE))
# ['live.', 'love.']

re.S/re.DOTALL

デフォルトでは'.'は改行にマッチしないが、このフラグにより'.'を改行を含む任意の文字とマッチさせる。

import re

s = """Love the life you live.
Live the life you love."""

# By default '.' does not match a newline, so each line is a separate match.
print(re.findall(r'.+', s))
# ['Love the life you live.', 'Live the life you love.']

# With re.DOTALL '.' also matches the newline, giving one match for the
# whole string.
print(re.findall(r'.+', s, flags=re.DOTALL))
# ['Love the life you live.\nLive the life you love.']

import re

# Two-line sample text. A blank line inside the literal would put "\n\n"
# into the string and change the re.DOTALL result shown below.
s = """Love the life you live.
Live the life you love."""

# By default '.' does not match a newline, so each line is a separate match.
print(re.findall(r'.+', s))
# ['Love the life you live.', 'Live the life you love.']

# With re.DOTALL '.' also matches the newline, giving one match for the
# whole string.
print(re.findall(r'.+', s, flags=re.DOTALL))
# ['Love the life you live.\nLive the life you love.']

re.X/re.VERBOSE

このフラグにより、パターン文字列内に改行・インデントを入れたりコメントを入れるなど、より読みやすい正規表現を書くことができる。コメントには'#'を使う。

TauStation

Python3 – 正規表現 – モジュール定数

コメントを残すコメントをキャンセル

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル