在进行url编码时,std :: regex_replace不适用于字符“ +”。 [英] While doing url encoding, the std::regex_replace doesn't work properly for character "+"
问题描述
下面是代码段,regex_replace不适用于字符 +,我不应对字符进行特殊处理,但应能正常工作。
/ *所有头文件均可用。* /
std :: string charToHex(unsigned char c,bool bUpperCase) ;
std :: string urlEncode(const std :: string& toEncode,bool bEncodeForwardSlash);
std :: string getEncodedUrl(const std :: string& url){
std :: string bktObjKey =;
std :: string urlEnc = url;
boost :: regex expression( ^(([[^:/?#] +):)?(//([^ /?#:] *)(:\\d + )?)?([[^?#] *)((\\?[^#] *))?(#(。*))?);
std :: string :: const_iterator start = url.begin(),end = url.end();
boost :: match_results< std :: string :: const_iterator>什么;
boost :: match_flag_type标志= boost :: match_default;
if(regex_search(url.begin(),url.end(),什么,表达式,标志)){
std :: cout<< Matched<< std :: endl ;
bktObjKey.insert(bktObjKey.begin(),what [6] .first,what [6] .second);
std :: regex fobj(bktObjKey);
/ * std :: string fobj = bktObjKey; * /
/ * auto pos = url.find(bktObjKey); * /
bktObjKey = urlEncode(bktObjKey,false) ;
std :: cout<< bktObjKey:<< bktObjKey.c_str()<< urlEnc:<< urlEnc.c_str()<< std :: endl;
urlEnc = std :: regex_replace(url,fobj,bktObjKey);
std :: cout<< urlEnc:<< urlEnc.c_str()<< std :: endl;
}
返回urlEnc;
}
std :: string urlEncode(const std :: string& toEncode,bool bEncodeForwardSlash){
std :: ostringstream out;
std :: cout<<内部编码<< std :: endl;
for(std :: string :: size_type i = 0; i< toEncode.length(); ++ i){
char ch = toEncode.at(i);
if(((ch> ='A'& ch< ='Z'))||
(ch> ='a'& ch< ='z ')||
(ch> ='0'&& ch< ='9')||
(ch =='_'|| ch =='-'| | | ch =='〜'|| ch =='。')||
(ch =='/'&&!bEncodeForwardSlash)){
out<< ch;
std :: cout<< out.str()<<"未编码为十六进制<<< std :: endl;
}
else {
out<< %<< charToHex(ch,true);
std :: cout<< out.str()<<"编码为十六进制<<< std :: endl;
}
}
std :: cout<<< Return:<< out.str()<< std :: endl;
return out.str();
}
std :: string charToHex(unsigned char c,bool bUpperCase){
short i = c;
std :: stringstream s;
s<< std :: setw(2)<< std :: setfill('0')<< std :: hex<<一世;
return s.str();
}
int main(){
std :: string url1 = http://10.130.0.36/rbkt10/+;
std :: string out1 = getEncodedUrl(url1);
std :: cout<<编码的URL1 =:<< out1<< std :: endl;
返回0;
}
输出:
编码的URL1 =:
#include< boost / regex.hpp>
#include< iostream>
#include< iomanip>
无效writeHex(std :: ostream& os,无符号字符c,布尔型大写){
os<< std :: setfill('0')<< std :: hex;
如果(大写)
os<< std ::大写;
os<< ‘%’<< std :: setw(2)<< static_cast< int>(c);
}
void urlEncode(std :: ostream& os,const std :: string& toEncode,bool bEncodeForwardSlash){
auto is_safe = [=](uint8_t ch) {
return std :: isalnum(ch)||
(ch ==‘/’&&!bEncodeForwardSlash)||
std :: strchr( _-〜。,ch);
};
for(char ch:toEncode){
if(is_safe(ch))
os<< ch;
else
writeHex(os,ch,true);
}
}
std :: string urlEncode(const std :: string& toEncode,bool bEncodeForwardSlash){
std :: ostringstream out;
urlEncode(out,toEncode,bEncodeForwardSlash);
return out.str();
}
std :: string getEncodedUrl(std :: string url){
boost :: regex uri_regex(
R(^(( ?< scheme> [^:/?#] +):)?)
R((??
这就是为什么您使用库的原因。
¹ (这会导致+从输入中保留。它不是重复的,它只是不被替换,因为 / +
表示1个或多个 /
)。
²请参阅 https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax
Following is the code snippet. regex_replace doesn't work properly for the character "+". I should not need special handling for individual characters; it should just work.
/*All headerfiles are available.*/
// Forward declarations; both functions are defined further down in this listing.
// Formats one byte as hexadecimal text.
std::string charToHex(unsigned char c, bool bUpperCase);
// Percent-encodes a string; '/' is copied through when bEncodeForwardSlash is false.
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash);
// Splits `url` with a URI regex, percent-encodes the path component via
// urlEncode(), and substitutes the encoded text back into the URL.
// Returns a copy of `url` unchanged when the expression does not match.
std::string getEncodedUrl(const std::string& url){
std::string bktObjKey = "";
std::string urlEnc = url;
// Capture group 6 of this expression is the path ([^?#]*) that follows the
// optional scheme (groups 1-2) and the optional //host:port part (groups 3-5).
boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");
// NOTE(review): start and end are not used below.
std::string::const_iterator start=url.begin(), end = url.end();
boost::match_results<std::string::const_iterator> what;
boost::match_flag_type flags = boost::match_default;
if (regex_search(url.begin(), url.end(), what, expression, flags)) {
std::cout<<"Matched"<<std::endl;
// Copy the text matched by capture group 6 (the path) into bktObjKey.
bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);
// NOTE(review): the raw path text is compiled as a regular expression here, so
// regex metacharacters in it are operators, not literals. For the path
// "/rbkt10/+" the trailing "/+" means "one or more '/'", which is why the '+'
// of the input is not part of the match that gets replaced below.
std::regex fobj(bktObjKey);
/*std::string fobj = bktObjKey;*/
/*auto pos = url.find(bktObjKey);*/
// From here on bktObjKey holds the percent-encoded path ('/' kept as-is).
bktObjKey = urlEncode(bktObjKey, false);
std::cout<<"bktObjKey :"<<bktObjKey.c_str()<<" urlEnc: "<<urlEnc.c_str()<<std::endl;
// Replace every match of the pattern built from the unencoded path with the encoded path.
urlEnc = std::regex_replace(url, fobj, bktObjKey);
std::cout<<" urlEnc: "<<urlEnc.c_str()<<std::endl;
}
return urlEnc;
}
// Percent-encodes `toEncode` and returns the result.
// ASCII letters, digits, '_', '-', '~' and '.' are copied unchanged, as is '/'
// when bEncodeForwardSlash is false. Every other character is written as '%'
// followed by charToHex(ch, true).
// Also prints a progress trace to std::cout for every character processed.
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash) {
std::ostringstream out;
std::cout<<"inside encode"<<std::endl;
for(std::string::size_type i=0; i < toEncode.length(); ++i) {
char ch = toEncode.at(i);
// Characters that are passed through without encoding.
if ((ch >= 'A' && ch <= 'Z') ||
(ch >= 'a' && ch <= 'z') ||
(ch >= '0' && ch <= '9') ||
(ch == '_' || ch == '-' || ch == '~' || ch == '.') ||
(ch == '/' && !bEncodeForwardSlash)) {
out << ch;
std::cout<<out.str()<<" Is not coded to HEX"<<std::endl;
}
else {
// Everything else becomes "%" plus the hex text of the character.
out << "%" << charToHex(ch, true);
std::cout<<out.str()<<" Is coded to HEX"<<std::endl;
}
}
std::cout<<"Return :"<<out.str()<<std::endl;
return out.str();
}
// Returns `c` formatted as hexadecimal text, zero-padded to a width of two.
// NOTE(review): bUpperCase is never read, so the digits come out in the
// stream's default (lowercase) form, e.g. "2b" for '+'.
std::string charToHex(unsigned char c, bool bUpperCase) {
// Widen to an integer type so the stream prints a number, not a character.
short i = c;
std::stringstream s;
s << std::setw(2) << std::setfill('0') << std::hex << i;
return s.str();
}
// Demo driver: encodes one sample URL whose path ends in '+' and prints the result.
int main() {
std::string url1 ="http://10.130.0.36/rbkt10/+";
std::string out1 = getEncodedUrl(url1);
std::cout<<"Encoded URL1=:"<<out1<<std::endl;
return 0;
}
OUTPUT:
Encoded URL1=:http://10.130.0.36/rbkt10/%2b+
So in the output the "+" effectively appears twice (once encoded as "%2b" and once more literally). It should appear only once. How can I make it work correctly?
解决方案
You're interpreting the original string as a regex. +
is special in regex¹.
You should simply use std::string::replace
because you don't need regex replace functionality:
boost::smatch what;
if (regex_search(url.cbegin(), url.cend(), what, expression)) {
boost::ssub_match query = what[6];
url.replace(query.first, query.second, urlEncode(query.str(), false));
}
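Complicated, scattered code like this (`std::string bktObjKey = "";` at the top, followed later by `bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);`)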
could simply be:
std::string bktObjKey = what[6].str();
Complicated loop
for (std::string::size_type i = 0; i < toEncode.length(); ++i) {
char ch = toEncode.at(i);
Could just be
for (char ch : toEncode) {
charToHex
creates a new 2-char string every time, using another stringstream every time, copying the result out of the stringstream, etc. Instead, just write to the stream you already have and avoid all the inefficiency:
/// Writes `c` to `os` as exactly two hexadecimal digits (zero-padded).
///
/// @param os        Destination stream.
/// @param c         Byte to format.
/// @param uppercase Emit the digits A-F when true, a-f when false.
///
/// The digits come from a lookup table and are written with put(), so the
/// stream's formatting state (numeric base, fill character, uppercase flag) is
/// left untouched. Manipulators such as std::hex are sticky: using them here
/// would change how the caller's later integer output is rendered, and a
/// previously set std::uppercase would override `uppercase == false`.
void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
    static constexpr char kLowerDigits[] = "0123456789abcdef";
    static constexpr char kUpperDigits[] = "0123456789ABCDEF";
    const char* const digits = uppercase ? kUpperDigits : kLowerDigits;
    // High nibble first, then low nibble.
    os.put(digits[c >> 4]).put(digits[c & 0x0F]);
}
Note that this also fixes the fact that you forgot to use bUpperCase.
Look at <cctype>
for help classifying characters.
Use raw literals to write
boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");
instead as:
boost::regex expression(R"(^(([^:/?#]+):)?(//([^/?#:]*)(:\d+)?)?([^?#]*)((\?[^#]*))?(#(.*))?)");
(no need to doubly escape \d
and \?
)
Either drop all the redundant sub-groups
boost::regex expression(R"(^([^:/?#]+:)?(//[^/?#:]*(:\d+)?)?[^?#]*(\?[^#]*)?(#.*)?)");
OR make them maintainable and useful²:
// Named groups expose the logical URI components:
// scheme, authority (with host and port), path, query and fragment.
// Each group name must be introduced with an unescaped "(?<name>"; writing
// "(\?<name>" would instead match the literal text "?<name>".
boost::regex uri_regex(
R"(^((?<scheme>[^:/?#]+):)?)"
R"((?<authority>//(?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
R"((?<path>[^?#]*))"
R"((\?(?<query>([^#]*)))?)"
R"((#(?<fragment>.*))?)");
Now that you have access to logical components of the URI, apply it to know better when and where to encode:
std::string escaped =
what["scheme"].str() +
what["authority"].str() +
urlEncode(what["path"].str(), false);
if (query.matched) {
escaped += '?';
escaped.append(urlEncode(query, true));
}
if (fragment.matched) {
escaped += '#';
escaped.append(urlEncode(fragment, true));
}
Make an overload of urlEncode
that takes an existing ostream reference instead of always creating your own:
std::ostringstream out;
out << what["scheme"] << what["authority"];
urlEncode(out, what["path"], false);
if (query.matched)
urlEncode(out << '?', query, true);
if (fragment.matched)
urlEncode(out << '#', fragment, true);
Code After Review
#include <cctype>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

#include <boost/regex.hpp>
/// Writes the percent-encoded form of `c` to `os`: a '%' followed by exactly
/// two hexadecimal digits (zero-padded).
///
/// @param os        Destination stream.
/// @param c         Byte to encode.
/// @param uppercase Emit the digits A-F when true, a-f when false.
///
/// The digits come from a lookup table and are written with put(), so the
/// stream's formatting state (numeric base, fill character, uppercase flag) is
/// left untouched. Manipulators such as std::hex are sticky: using them here
/// would change how the caller's later integer output is rendered, and a
/// previously set std::uppercase would override `uppercase == false`.
void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
    static constexpr char kLowerDigits[] = "0123456789abcdef";
    static constexpr char kUpperDigits[] = "0123456789ABCDEF";
    const char* const digits = uppercase ? kUpperDigits : kLowerDigits;
    // '%', then high nibble, then low nibble.
    os.put('%').put(digits[c >> 4]).put(digits[c & 0x0F]);
}
/// Percent-encodes `toEncode` and writes the result to `os`.
///
/// ASCII letters, ASCII digits and the characters '_', '-', '~', '.' are
/// copied unchanged; '/' is copied unchanged only when `bEncodeForwardSlash`
/// is false. Every other byte, including NUL and bytes above 0x7F, is written
/// through writeHex() in uppercase form.
///
/// @param os                  Destination stream.
/// @param toEncode            Text to encode; treated as a sequence of bytes.
/// @param bEncodeForwardSlash When true, '/' is encoded like any other
///                            reserved character.
void urlEncode(std::ostream& os, const std::string &toEncode, bool bEncodeForwardSlash) {
    // The safe set is spelled out explicitly instead of using std::isalnum and
    // std::strchr: isalnum depends on the global C locale, and strchr reports
    // a match for the terminating NUL, which would let '\0' through unencoded.
    const auto is_safe = [bEncodeForwardSlash](unsigned char ch) {
        switch (ch) {
            case '_':
            case '-':
            case '~':
            case '.':
                return true;
            case '/':
                return !bEncodeForwardSlash;
            default:
                return (ch >= 'A' && ch <= 'Z') ||
                       (ch >= 'a' && ch <= 'z') ||
                       (ch >= '0' && ch <= '9');
        }
    };

    for (const char ch : toEncode) {
        const auto byte = static_cast<unsigned char>(ch);
        if (is_safe(byte))
            os << ch;
        else
            writeHex(os, byte, true);
    }
}
/// Convenience overload: percent-encodes `toEncode` into a fresh string.
///
/// Delegates to the stream-based urlEncode() overload and returns whatever
/// that overload wrote.
///
/// @param toEncode            Text to encode.
/// @param bEncodeForwardSlash Forwarded unchanged to the stream overload.
/// @return The encoded text.
std::string urlEncode(const std::string &toEncode, bool bEncodeForwardSlash) {
    std::ostringstream encoded;
    urlEncode(encoded, toEncode, bEncodeForwardSlash);
    return encoded.str();
}
std::string getEncodedUrl(std::string url) {
boost::regex uri_regex(
R"(^((?<scheme>[^:/?#]+):)?)"
R"((?<authority>//(\?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
R"((?<path>[^?#]*))"
R"((\?(?<query>([^#]*)))?)"
R"((#(?<fragment>.*))?)");
boost::match_results<std::string::iterator> what;
//boost::smatch what;
if (regex_search(url.begin(), url.end(), what, uri_regex)) {
auto& full = what[0];
auto& query = what["query"];
auto& fragment = what["fragment"];
std::ostringstream out;
out << what["scheme"] << what["authority"];
urlEncode(out, what["path"], false);
if (query.matched)
urlEncode(out << '?', query, true);
if (fragment.matched)
urlEncode(out << '#', fragment, true);
url.replace(full.begin(), full.end(), out.str());
}
return url;
}
/// Demo driver: runs getEncodedUrl() over a fixed list of sample inputs and
/// prints one "Encoded URL: ..." line per sample.
int main() {
    const std::string samples[] = {
        "http://10.130.0.36/rbkt10/+",
        "//10.130.0.36/rbkt10/+",
        "//localhost:443/rbkt10/+",
        "https:/rbkt10/+",
        "https:/rbkt10/+?in_params='please do escape / (forward slash)'&more#also=in/fragment",
        "match inside text http://10.130.0.36/rbkt10/+ is a bit fuzzy",
    };

    for (const std::string& sample : samples) {
        std::cout << "Encoded URL: " << getEncodedUrl(sample) << std::endl;
    }
}
Prints
Encoded URL: http//10.130.0.36/rbkt10/%2B
Encoded URL: //10.130.0.36/rbkt10/%2B
Encoded URL: //localhost%3A443/rbkt10/%2B
Encoded URL: https/rbkt10/%2B
Encoded URL: https/rbkt10/%2B?in_params%3D%27please%20do%20escape%20%2F%20%28forward%20slash%29%27%26more#also%3Din%2Ffragment
Encoded URL: match inside text http//10.130.0.36/rbkt10/%2B%20is%20a%20bit%20fuzzy
CAUTION
Notice that the code STILL doesn't fully adhere to the URI specification (see the generic syntax referenced in note ²).
This is why you use a library instead.
¹ (This causes + to be left from the input. It's not "repeated", it's just not replaced because /+
means 1 or more /
).
² See https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax
这篇关于在进行url编码时,std :: regex_replace不适用于字符“ +”。的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!