Python3 – 正規表現 – モジュール関数

1 概要
2 正規表現のコンパイル
- 2.1 compile()
3 パターンの検索
4 分割
- 4.1 split()
5 置換
- 5.1 sub()
- 5.2 subn()

概要

reモジュールの関数は、パターンと文字列を直接指定してマッチングなどの操作を行う。

引数の中のflagsについては、reモジュールで定義された定数を指定する。複数のflagsを指定する場合は、ビットごとのOR('|'演算子)を使って組み合わせる。

パターンは実行に先立ってコンパイルされるので、同じパターンを複数回用いる場合には、re.compile()関数でパターンをあらかじめコンパイルし、得られた正規表現オブジェクトのメソッドを用いる方がよい。

正規表現のコンパイル

compile()

re.compile(pattern, flags=0)

正規表現パターンを正規表現オブジェクトにコンパイルする。正規表現オブジェクトのメソッド群で、以下のモジュール関数と同等の操作を行うことができる。

以下のモジュール関数を使う場合はflagsを関数ごとに指定するが、正規表現オブジェクトを使う場合は、compile()関数の引数でflagsを指定する。

パターンの検索

search()

re.search(pattern, string, flags=0)

stringの任意の位置で、最初にpatternにマッチした時にMatchObjectのオブジェクトを返す。マッチしなければNoneを返す。

import re
s = r"abcdabcd"
print(re.search(r'ab', s))
# <_sre.SRE_Match object; span=(0, 2), match='ab'>
print(re.search(r'cd', s))
# <_sre.SRE_Match object; span=(2, 4), match='cd'>
print(re.search(r'ef', s))
# None

import re

s = r"abcdabcd"

print(re.search(r'ab', s))

# <_sre.SRE_Match object; span=(0, 2), match='ab'>

print(re.search(r'cd', s))

# <_sre.SRE_Match object; span=(2, 4), match='cd'>

print(re.search(r'ef', s))

# None

match()

re.match(pattern, string, flags=0)

stringの先頭でpatternにマッチすればMatchObjectのオブジェクトを返す。マッチしなければNoneを返す。stringの途中ではマッチしない。

import re
s = r"abcdabcd"
print(re.match(r'ab', s))
# <_sre.SRE_Match object; span=(0, 2), match='ab'>
print(re.match(r'cd', s))
# None

import re

s = r"abcdabcd"

print(re.match(r'ab', s))

# <_sre.SRE_Match object; span=(0, 2), match='ab'>

print(re.match(r'cd', s))

# None

fullmatch()

re.fullmatch(pattern, string, flags=0)

patternがstring全体にマッチしたときだけMatchObjectのオブジェクトを返し、それ以外はNoneを返す。

import re
s = r"abcdabcd"
print(re.fullmatch(r'abcdabcd', s))
# <_sre.SRE_Match object; span=(0, 8), match='abcdabcd'>
print(re.fullmatch(r'abcd', s))
# None
print(re.fullmatch(r'.*', s))
# <_sre.SRE_Match object; span=(0, 8), match='abcdabcd'>
print(re.fullmatch(r'.{5}', s))
# None

import re

s = r"abcdabcd"

print(re.fullmatch(r'abcdabcd', s))

# <_sre.SRE_Match object; span=(0, 8), match='abcdabcd'>

print(re.fullmatch(r'abcd', s))

# None

print(re.fullmatch(r'.*', s))

# <_sre.SRE_Match object; span=(0, 8), match='abcdabcd'>

print(re.fullmatch(r'.{5}', s))

# None

findall()

re.findall(pattern, string, flags=0)

string中でpatternにマッチする全ての部分文字列を要素とするリストを返す。マッチする部分がなければ空のリスト([])を返す。stringは先頭から順に走査され、マッチ同士は重複しない(次の検索は直前のマッチの終了位置から始まる)。

import re
s = r"abcdabcd"
print(re.findall(r'ab', s))
# ['ab', 'ab']
print(re.findall(r'ef', s))
# []
print(re.findall(r'.{3}', s))
# ['abc', 'dab']

import re

s = r"abcdabcd"

print(re.findall(r'ab', s))

# ['ab', 'ab']

print(re.findall(r'ef', s))

# []

print(re.findall(r'.{3}', s))

# ['abc', 'dab']

finditer()

re.finditer(pattern, string, flags=0)

string中でpatternにマッチした結果のMatchObjectオブジェクトのイテレータを返す。マッチする部分がなければ空のイテレータを返す。stringは先頭から順に走査され、マッチ同士は重複しない(次の検索は直前のマッチの終了位置から始まる)。

import re
s = r"abcdabcd"
[print(x) for x in re.finditer(r'ab', s)]
# <_sre.SRE_Match object; span=(0, 2), match='ab'>
# <_sre.SRE_Match object; span=(4, 6), match='ab'>
[print(x) for x in re.finditer(r'ef', s)]
# 空のイテレータ
[print(x) for x in re.finditer(r'.{3}', s)]
# <_sre.SRE_Match object; span=(0, 3), match='abc'>
# <_sre.SRE_Match object; span=(3, 6), match='dab'>

import re

s = r"abcdabcd"

[print(x) for x in re.finditer(r'ab', s)]

# <_sre.SRE_Match object; span=(0, 2), match='ab'>

# <_sre.SRE_Match object; span=(4, 6), match='ab'>

[print(x) for x in re.finditer(r'ef', s)]

# 空のイテレータ

[print(x) for x in re.finditer(r'.{3}', s)]

# <_sre.SRE_Match object; span=(0, 3), match='abc'>

# <_sre.SRE_Match object; span=(3, 6), match='dab'>

分割

split()

re.split(pattern, string, maxsplit=0, flags=0)

stringをすべてのpatternにマッチする部分で分割し、それらを要素とするリストを返す。patternにマッチする部分は除かれる。マッチする部分がなければ、stringを1つの要素とするリストが返される。

maxsplitに1以上の数nを指定すると、先頭から最大n個の分割が発生し、残りはリストの最後の要素となる。

import re
s =r"she sells sea shells by the seashore"
print(re.split(r' ', s))
# ['she', 'sells', 'sea', 'shells', 'by', 'the', 'seashore']
print(re.split(r'se', s))
# ['she ', 'lls ', 'a shells by the ', 'ashore']
print(re.split(r'xyz', s))
# ['she sells sea shells by the seashore']
print(re.split(r' ', s, maxsplit=3))
# ['she', 'sells', 'sea', 'shells by the seashore']
print(re.split(r' ', s, maxsplit=10))
# ['she', 'sells', 'sea', 'shells', 'by', 'the', 'seashore']

import re

s =r"she sells sea shells by the seashore"

print(re.split(r' ', s))

# ['she', 'sells', 'sea', 'shells', 'by', 'the', 'seashore']

print(re.split(r'se', s))

# ['she ', 'lls ', 'a shells by the ', 'ashore']

print(re.split(r'xyz', s))

# ['she sells sea shells by the seashore']

print(re.split(r' ', s, maxsplit=3))

# ['she', 'sells', 'sea', 'shells by the seashore']

print(re.split(r' ', s, maxsplit=10))

# ['she', 'sells', 'sea', 'shells', 'by', 'the', 'seashore']

patternが先頭の部分に一致する場合は、リストの最初は空文字列から始まり、最後の部分に一致する場合はリストの最終要素が空文字列になる。

import re
s = "abracadabra"
print(re.split(r'ab', s))
# ['', 'racad', 'ra']
print(re.split(r'ra', s))
# ['ab', 'cadab', '']

import re

s = "abracadabra"

print(re.split(r'ab', s))

# ['', 'racad', 'ra']

print(re.split(r'ra', s))

# ['ab', 'cadab', '']

Python 3.6以前では、patternが空文字列にマッチした位置では分割されず、空文字列にしかマッチしないパターン(例: '')を指定するとエラー(ValueError)になる。Python 3.7以降では、空文字列へのマッチでも分割が行われる。

置換

sub()

re.sub(pattern, repl, string, count=0, flags=0)

string中でpatternにマッチする部分文字列をreplの文字列で置換する。countで正の整数値を指定すると、先頭から最大その回数だけ置換を行う。

import re
s = "she sells sea shells by the seashore"
print(re.sub(r'sh', "++", s))
# ++e sells sea ++ells by the sea++ore
print(re.sub(r'sh', "++", s, count=2))
# ++e sells sea ++ells by the seashore

import re

s = "she sells sea shells by the seashore"

print(re.sub(r'sh', "++", s))

# ++e sells sea ++ells by the sea++ore

print(re.sub(r'sh', "++", s, count=2))

# ++e sells sea ++ells by the seashore

patternが空文字列にマッチした場合は、直前のマッチに隣接していない位置でのみ置換が行われる(Python 3.7以降では、空でないマッチの直後にある空文字列へのマッチも置換される)。

import re
print(re.sub(r'x*', '-', "abcde"))
# -a-b-c-d-e-

import re

print(re.sub(r'x*', '-', "abcde"))

# -a-b-c-d-e-

replには文字列を返す関数を指定できる。

import re

def repl_func(matchobj):
    if matchobj.group(0).isalpha():
        return 'A'
    else:
        return '0'

print(re.sub(r'[A-Z0-9]', repl_func, "TK-80-BS"))
# AA-00-AA

import re

def repl_func(matchobj):

    if matchobj.group(0).isalpha():

        return 'A'

    else:

        return '0'

print(re.sub(r'[A-Z0-9]', repl_func, "TK-80-BS"))

# AA-00-AA

subn()

re.subn(pattern, repl, string, count=0, flags=0)

sub()と同じ操作を行うが、タプルで(置換後の文字列, 置換数)を返す。

import re
s = "she sells sea shells by the seashore"
print(re.subn(r'sh', "++", s))
# ('++e sells sea ++ells by the sea++ore', 3)
print(re.subn(r'sh', "++", s, count=2))
# ('++e sells sea ++ells by the seashore', 2)

import re

s = "she sells sea shells by the seashore"

print(re.subn(r'sh', "++", s))

# ('++e sells sea ++ells by the sea++ore', 3)

print(re.subn(r'sh', "++", s, count=2))

# ('++e sells sea ++ells by the seashore', 2)

TauStation

Python3 – 正規表現 – モジュール関数

概要

正規表現のコンパイル

compile()

パターンの検索

search()

match()

fullmatch()

findall()

finditer()

分割

split()

置換

sub()

subn()

コメントを残すコメントをキャンセル

概要

正規表現のコンパイル

compile()

パターンの検索

search()

match()

fullmatch()

findall()

finditer()

分割

split()

置換

sub()

subn()

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル