【C++】regex 正则表达式

正则表达式是一种描述字符序列的方法，是C++11标准库中新加入的强大工具。正则表达式是一种用于字符串处理的微型语言，适用于一些与字符串相关的操作。C++11包含了对以下几种语法的支持：ECMAScript、basic、extended、awk、grep和egrep。C++11中使用的默认语法是ECMAScript。

RE库定义在头文件regex中，它包含多个组件：

匹配

regex_match：regex_match()算法可以用于比较一个给定源字符串和一个正则表达式模式，如果模式匹配整个源字符串，则返回true，否则返回false。

#include <iostream>
#include <regex>
using namespace std;

// Demonstrates std::regex_match on the sample text "twinkle1993":
//   1. whole string against [a-z0-9]+
//   2. whole string against \d+
//   3. the iterator range starting at offset 7 against \d+
//   4. the iterator range starting at offset 5 against ([a-z]+)(\d+), collecting groups in an smatch
//   5. the C string against ([a-z]+)(\d+), collecting groups in a cmatch
// regex_match only succeeds when the pattern covers the entire input range.
int main()
{
	const std::string text = "twinkle1993";

	// Prints the one-line verdict for the piece of text that was tested.
	const auto report = [](const std::string& shown, bool matched) {
		std::cout << "字符串：" << shown << (matched ? " 匹配成功！" : " 匹配失败！") << std::endl;
	};

	// Case 1: lowercase letters and digits, whole string.
	const std::regex alnum("[a-z0-9]+");
	std::cout << "正则表达式：[a-z0-9]+" << std::endl;
	report(text, std::regex_match(text, alnum));

	// Case 2: digits only, whole string.
	const std::regex digits("\\d+");
	std::cout << std::endl << "正则表达式：\\d+" << std::endl;
	report(text, std::regex_match(text, digits));

	// Case 3: digits only, but restricted to the range that starts at offset 7.
	std::cout << std::endl << "正则表达式：\\d+" << std::endl;
	report(text.substr(7), std::regex_match(text.begin() + 7, text.end(), digits));

	// Case 4: letters followed by digits on the range starting at offset 5; groups go into an smatch.
	const std::regex wordThenNumber("([a-z]+)(\\d+)");
	std::smatch tailGroups;
	std::cout << std::endl << "正则表达式：([a-z]+)(\\d+)" << std::endl;
	const bool tailMatched = std::regex_match(text.cbegin() + 5, text.cend(), tailGroups, wordThenNumber);
	report(text.substr(5), tailMatched);
	if (tailMatched)
	{
		// Entry 0 is the whole match, the following entries are the capture groups.
		std::cout << "匹配字符串个数：" << tailGroups.size() << std::endl;
		std::cout << "分别为：";
		for (std::smatch::size_type i = 0; i < tailGroups.size(); ++i)
			std::cout << tailGroups[i].str() << " ";
		std::cout << std::endl;
	}

	// Case 5: same pattern on the C string; groups go into a cmatch.
	std::cmatch wholeGroups;
	std::cout << std::endl << "正则表达式：([a-z]+)(\\d+)" << std::endl;
	const bool wholeMatched = std::regex_match(text.c_str(), wholeGroups, wordThenNumber);
	report(text, wholeMatched);
	if (wholeMatched)
	{
		std::cout << "匹配字符串个数：" << wholeGroups.size() << std::endl;
		std::cout << "分别为：";
		for (std::cmatch::size_type i = 0; i < wholeGroups.size(); ++i)
			std::cout << wholeGroups[i].str() << " ";
		std::cout << std::endl;
	}
	return 0;
}

运行结果：

正则表达式：[a-z0-9]+
字符串：twinkle1993 匹配成功！

正则表达式：\d+
字符串：twinkle1993 匹配失败！

正则表达式：\d+
字符串：1993 匹配成功！

正则表达式：([a-z]+)(\d+)
字符串：le1993 匹配成功！
匹配字符串个数：3
分别为：le1993 le 1993

正则表达式：([a-z]+)(\d+)
字符串：twinkle1993 匹配成功！
匹配字符串个数：3
分别为：twinkle1993 twinkle 1993

查找

regex_search：regex_search()算法可以在输入字符串中提取匹配的子字符串。smatch对象sm将包含搜索结果。如果要获得第一个捕捉组的字符串表达形式，可在代码中编写sm[1]或sm[1].str()。通过查看sm[1].first和sm[1].second迭代器可以得到这个子字符串在源字符串中出现的准确位置。

#include <iostream>
#include <regex>
using namespace std;

// Demonstrates repeated std::regex_search: each successful search is resumed
// from the end of the previous match (sm.suffix().first) until no match is left.
// For every match it prints the text that was searched, the number of
// sub-matches, each sub-match, and the prefix/suffix around the match.
int main()
{
	const std::string str = "twinkle1993winkle1993inkle1993";
	// Compile the pattern once; constructing it inside the loop condition
	// would recompile the same expression on every iteration.
	const std::regex pattern("([a-z]+)1");
	std::smatch sm;

	std::cout << "正则表达式：([a-z]+)1" << std::endl;
	for (auto it = str.cbegin(); std::regex_search(it, str.cend(), sm, pattern); it = sm.suffix().first)
	{
		// &*it yields a pointer into str's buffer, so this prints the
		// remainder of the string starting at the current search position.
		std::cout << "字符串：" << &*it << " 匹配成功！" << std::endl;
		std::cout << "匹配字符子串个数：" << sm.size() << std::endl;
		std::cout << "分别为：";
		// Entry 0 is the whole match; entry 1 is the capture group.
		for (const auto& group : sm)
			std::cout << group.str() << " ";

		std::cout << std::endl;
		std::cout << "字符串 " << sm.str() << " 前的字符串为：" << sm.prefix().str() << std::endl;
		std::cout << "字符串 " << sm.str() << " 后的字符串为：" << sm.suffix().str() << std::endl;
		std::cout << std::endl;
	}
	return 0;
}

运行结果：

正则表达式：([a-z]+)1
字符串：twinkle1993winkle1993inkle1993 匹配成功！
匹配字符子串个数：2
分别为：twinkle1 twinkle
字符串 twinkle1 前的字符串为：
字符串 twinkle1 后的字符串为：993winkle1993inkle1993

字符串：993winkle1993inkle1993 匹配成功！
匹配字符子串个数：2
分别为：winkle1 winkle
字符串 winkle1 前的字符串为：993
字符串 winkle1 后的字符串为：993inkle1993

字符串：993inkle1993 匹配成功！
匹配字符子串个数：2
分别为：inkle1 inkle
字符串 inkle1 前的字符串为：993
字符串 inkle1 后的字符串为：993

regex_iterator

为了逐一迭代正则查找的所有匹配结果，我们也可以使用regex_iterator。一般情况下，需要为某个特定的容器指定一个尾迭代器，但是对于regex_iterator，只有一个end值。只需要通过默认的构造函数声明一个regex_iterator类型，就可以获得这个尾迭代器：这个尾迭代器会被隐式地初始化为end值。

#include <iostream>
#include <regex>
using namespace std;

// Demonstrates std::sregex_iterator: walks every match of ([a-z]+)1 in the
// sample text and, for each one, prints the text starting at the match's
// prefix, the number of sub-matches, each sub-match, and the prefix/suffix.
int main()
{
	const std::string text = "twinkle1993twink1993le1993";
	const std::regex pattern("([a-z]+)1");

	std::cout << "正则表达式：([a-z]+)1" << std::endl;

	// A default-constructed iterator is the end-of-sequence sentinel.
	const std::sregex_iterator noMoreMatches;
	std::sregex_iterator cursor(text.begin(), text.end(), pattern);
	while (cursor != noMoreMatches)
	{
		const std::smatch& hit = *cursor;

		// prefix().first points into text, so printing its address as a
		// C string shows everything from the start of the prefix onwards.
		std::cout << "字符串：" << &*hit.prefix().first << " 匹配成功！" << std::endl;
		std::cout << "匹配字符子串个数：" << hit.size() << std::endl;

		std::cout << "分别为：";
		for (const auto& group : hit)
			std::cout << group.str() << " ";
		std::cout << std::endl;

		const std::string matched = hit.str();
		std::cout << "字符串 " << matched << " 前的字符串为：" << hit.prefix().str() << std::endl;
		std::cout << "字符串 " << matched << " 后的字符串为：" << hit.suffix().str() << std::endl;
		std::cout << std::endl;

		++cursor;
	}
	return 0;
}

运行结果：

正则表达式：([a-z]+)1
字符串：twinkle1993twink1993le1993 匹配成功！
匹配字符子串个数：2
分别为：twinkle1 twinkle
字符串 twinkle1 前的字符串为：
字符串 twinkle1 后的字符串为：993twink1993le1993

字符串：993twink1993le1993 匹配成功！
匹配字符子串个数：2
分别为：twink1 twink
字符串 twink1 前的字符串为：993
字符串 twink1 后的字符串为：993le1993

字符串：993le1993 匹配成功！
匹配字符子串个数：2
分别为：le1 le
字符串 le1 前的字符串为：993
字符串 le1 后的字符串为：993

regex_token_iterator

regex_iterator有助于迭代“匹配合格”的子序列。然而有时候你会想处理那些子序列之间的内容，特别是当你打算将string拆分为一个个语汇单元token或以某个东西分割string，分隔符甚至可能被指定为一个正则表达式。regex_token_iterator就提供了这样的功能。

为了将它初始化，需要传给它字符序列的起点和终点，以及一个正则表达式。此外还可以指明一列整数值，用来表示语汇化过程中的元素：
* -1：表示你对每一个“匹配之正则表达式之间”或“语汇切分器之间”的子序列感兴趣
* 0：表示你对每一个匹配之正则表达式或语汇切分器感兴趣
* 任何其他数字n：表示你对正则表达式中的第n个匹配次表达式（即第n个捕获组）感兴趣

扫描二维码关注公众号，回复： 3928323 查看本文章

#include <iostream>
#include <regex>
using namespace std;

// Demonstrates std::sregex_token_iterator on the sample text with the pattern
// ([a-z]+)1, listing the tokens three times:
//   selector  0 -> each whole match
//   selector  1 -> the first capture group of each match
//   selector -1 -> the pieces of text between the matches
int main()
{
	const std::string text = "11twinkle1993teink1992le1994";
	const std::regex pattern("([a-z]+)1");

	std::cout << "正则表达式：([a-z]+)1" << std::endl;
	std::cout << "字符串为：" << text << std::endl;

	// Prints every token produced for the given sub-match selector,
	// followed by one blank line.
	const auto listTokens = [&text, &pattern](int selector) {
		const std::sregex_token_iterator finished;
		std::sregex_token_iterator token(text.begin(), text.end(), pattern, selector);
		while (token != finished)
		{
			std::cout << "匹配到的字符串为：" << token->str() << std::endl;
			++token;
		}
		std::cout << std::endl;
	};

	listTokens(0);   // 0 is also the selector used when none is given
	listTokens(1);
	listTokens(-1);
	return 0;
}

运行结果：

正则表达式：([a-z]+)1
字符串为：11twinkle1993teink1992le1994
匹配到的字符串为：twinkle1
匹配到的字符串为：teink1
匹配到的字符串为：le1

匹配到的字符串为：twinkle
匹配到的字符串为：teink
匹配到的字符串为：le

匹配到的字符串为：11
匹配到的字符串为：993
匹配到的字符串为：992
匹配到的字符串为：994

替换

regex_replace：regex_replace()算法要求输入一个正则表达式，以及一个用于替换匹配子字符串的格式化字符串。这个格式化字符串可以通过转义序列引用匹配子字符串中的部分内容。

C++ STL之正则表达式

【正则表达式1】C++11正则表达式