在进行 URL 编码时，std::regex_replace 无法正确处理字符 "+" [英] While doing url encoding, the std::regex_replace doesn't work properly for character "+"

查看：66 发布时间：2021/4/20 18:31:41 c++ c++11 boost boost-regex

本文介绍了在进行 URL 编码时 std::regex_replace 无法正确处理字符 "+" 这一问题的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

下面是代码段，regex_replace不适用于字符"+"，我不应对字符进行特殊处理，但应能正常工作.

/*All headerfiles are available.*/

std::string charToHex(unsigned char c, bool bUpperCase);
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash);
std::string getEncodedUrl(const std::string& url){
std::string bktObjKey = "";

std::string urlEnc = url;

boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");

std::string::const_iterator start=url.begin(), end = url.end();
boost::match_results<std::string::const_iterator> what;
boost::match_flag_type flags = boost::match_default;
if (regex_search(url.begin(), url.end(), what, expression, flags)) {
  std::cout<<"Matched"<<std::endl;
  bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);

  std::regex fobj(bktObjKey);
  /*std::string fobj = bktObjKey;*/

  /*auto pos = url.find(bktObjKey);*/
  bktObjKey = urlEncode(bktObjKey, false);
  std::cout<<"bktObjKey :"<<bktObjKey.c_str()<<" urlEnc: "<<urlEnc.c_str()<<std::endl;

  urlEnc = std::regex_replace(url, fobj, bktObjKey);
  std::cout<<" urlEnc: "<<urlEnc.c_str()<<std::endl;
}
  return urlEnc;
}
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash)  {
  std::ostringstream out;

  std::cout<<"inside encode"<<std::endl;
  for(std::string::size_type i=0; i < toEncode.length(); ++i) {
    char ch = toEncode.at(i);
    if ((ch >= 'A' && ch <= 'Z') ||
        (ch >= 'a' && ch <= 'z') ||
        (ch >= '0' && ch <= '9') ||
        (ch == '_' || ch == '-' || ch == '~' || ch == '.') ||
        (ch == '/' && !bEncodeForwardSlash)) {
      out << ch;
      std::cout<<out.str()<<" Is not coded to HEX"<<std::endl;
    }
    else {
      out << "%" <<  charToHex(ch, true);
      std::cout<<out.str()<<" Is coded to HEX"<<std::endl;
    }
  }
  std::cout<<"Return :"<<out.str()<<std::endl;
  return out.str();
}

std::string charToHex(unsigned char c, bool bUpperCase) {
  short i = c;
  std::stringstream s;
  s << std::setw(2) << std::setfill('0') << std::hex << i;
  return s.str();
}

int main() {

std::string url1 ="http://10.130.0.36/rbkt10/+";
std::string out1 = getEncodedUrl(url1);
std::cout<<"Encoded URL1=:"<<out1<<std::endl;

return 0;
}

输出: Encoded URL1=:http://10.130.0.36/rbkt10/%2b+

#include <boost/regex.hpp>
#include <iostream>
#include <iomanip>

void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
    os << std::setfill('0') << std::hex;
    if (uppercase)
        os << std::uppercase;
    os << '%' << std::setw(2) << static_cast<int>(c);
}

void urlEncode(std::ostream& os, const std::string &toEncode, bool bEncodeForwardSlash) {
    auto is_safe = [=](uint8_t ch) {
        return std::isalnum(ch) ||
            (ch == '/' && !bEncodeForwardSlash) ||
            std::strchr("_-~.", ch);
    };

    for (char ch : toEncode) {
        if (is_safe(ch))
            os << ch;
        else
            writeHex(os, ch, true);
    }
}

std::string urlEncode(const std::string &toEncode, bool bEncodeForwardSlash) {
    std::ostringstream out;
    urlEncode(out, toEncode, bEncodeForwardSlash);
    return out.str();
}

std::string getEncodedUrl(std::string url) {

    boost::regex uri_regex(
        R"(^((?<scheme>[^:/?#]+):)?)"
        R"((?<authority>//(\?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
        R"((?<path>[^?#]*))"
        R"((\?(?<query>([^#]*)))?)"
        R"((#(?<fragment>.*))?)");

    boost::match_results<std::string::iterator> what;
    //boost::smatch what;
    if (regex_search(url.begin(), url.end(), what, uri_regex)) {
        auto& full     = what[0];
        auto& query    = what["query"];
        auto& fragment = what["fragment"];

        std::ostringstream out;
        out << what["scheme"] << what["authority"];
        urlEncode(out, what["path"], false);

        if (query.matched)
            urlEncode(out << '?', query, true);

        if (fragment.matched)
            urlEncode(out << '#', fragment, true);

        url.replace(full.begin(), full.end(), out.str());
    }
    return url;
}

int main() {
    for (std::string url : {
            "http://10.130.0.36/rbkt10/+",
            "//10.130.0.36/rbkt10/+",
            "//localhost:443/rbkt10/+",
            "https:/rbkt10/+",
            "https:/rbkt10/+?in_params='please do escape / (forward slash)'&more#also=in/fragment",
            "match inside text http://10.130.0.36/rbkt10/+ is a bit fuzzy",
          }) {
        std::cout << "Encoded URL: " << getEncodedUrl(url) << std::endl;
    }
}

打印

Encoded URL: http//10.130.0.36/rbkt10/%2B
Encoded URL: //10.130.0.36/rbkt10/%2B
Encoded URL: //localhost%3A443/rbkt10/%2B
Encoded URL: https/rbkt10/%2B
Encoded URL: https/rbkt10/%2B?in_params%3D%27please%20do%20escape%20%2F%20%28forward%20slash%29%27%26more#also%3Din%2Ffragment
Encoded URL: match inside text http//10.130.0.36/rbkt10/%2B%20is%20a%20bit%20fuzzy

注意

请注意，代码STILL不符合规范:

这就是为什么您使用库的原因.

¹(这会导致输入中的 + 被保留下来。它并不是被"重复"了，只是没有被替换，因为在正则表达式中 /+ 表示一个或多个 /。)

²请参见 https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax

Following is the code snippet. regex_replace doesn't work properly for the character "+". I should not need special handling for individual characters; it should just work.

/*All headerfiles are available.*/



std::string charToHex(unsigned char c, bool bUpperCase);
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash);
// Splits `url` with a generic URI regex, percent-encodes the text captured by
// group 6 (`([^?#]*)`, the part that follows the optional scheme and authority)
// and substitutes the encoded text back into the URL.
// Returns the rewritten URL, or an unmodified copy of `url` when the regex
// does not match. Progress messages are written to std::cout.
std::string getEncodedUrl(const std::string& url){
std::string bktObjKey = "";

std::string urlEnc = url;

boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");

// NOTE: `start` and `end` are not referenced again in this function.
std::string::const_iterator start=url.begin(), end = url.end();
boost::match_results<std::string::const_iterator> what;
boost::match_flag_type flags = boost::match_default;
if (regex_search(url.begin(), url.end(), what, expression, flags)) {
  std::cout<<"Matched"<<std::endl;
  // Copy the characters of capture group 6 into bktObjKey.
  bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);

  // The captured text is compiled as a regex *pattern*, so any regex
  // metacharacter it contains (for example '+') is treated as regex syntax
  // instead of being matched literally.
  std::regex fobj(bktObjKey);
  /*std::string fobj = bktObjKey;*/

  /*auto pos = url.find(bktObjKey);*/
  // From here on bktObjKey holds the percent-encoded form ('/' is kept as is).
  bktObjKey = urlEncode(bktObjKey, false);
  std::cout<<"bktObjKey :"<<bktObjKey.c_str()<<" urlEnc: "<<urlEnc.c_str()<<std::endl;

  // Replace whatever the pattern `fobj` matches in `url` with the encoded text.
  urlEnc = std::regex_replace(url, fobj, bktObjKey);
  std::cout<<" urlEnc: "<<urlEnc.c_str()<<std::endl;
}
  return urlEnc;
}
// Percent-encodes `toEncode` and returns the result.
// ASCII letters, digits and the characters '_', '-', '~', '.' are copied
// unchanged; '/' is copied unchanged only when `bEncodeForwardSlash` is false.
// Every other character is written as '%' followed by charToHex() of it.
// Trace output is written to std::cout for every character processed.
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash)  {
  std::ostringstream out;

  std::cout<<"inside encode"<<std::endl;
  for(std::string::size_type i=0; i < toEncode.length(); ++i) {
    char ch = toEncode.at(i);
    if ((ch >= 'A' && ch <= 'Z') ||
        (ch >= 'a' && ch <= 'z') ||
        (ch >= '0' && ch <= '9') ||
        (ch == '_' || ch == '-' || ch == '~' || ch == '.') ||
        (ch == '/' && !bEncodeForwardSlash)) {
      // Safe character: emit as is.
      out << ch;
      std::cout<<out.str()<<" Is not coded to HEX"<<std::endl;
    }
    else {
      // Everything else is emitted as a '%' escape.
      out << "%" <<  charToHex(ch, true);
      std::cout<<out.str()<<" Is coded to HEX"<<std::endl;
    }
  }
  std::cout<<"Return :"<<out.str()<<std::endl;
  return out.str();
}

// Formats the byte `c` as a two-character, zero-padded hexadecimal string
// using lowercase digits. `bUpperCase` is part of the signature but is not
// consulted by this function.
std::string charToHex(unsigned char c, bool bUpperCase) {
  std::stringstream hexText;
  hexText.fill('0');
  hexText.setf(std::ios_base::hex, std::ios_base::basefield);
  hexText.width(2);
  hexText << static_cast<int>(c);
  return hexText.str();
}

// Demo driver: encodes one sample URL whose last path character is '+' and
// prints the result to std::cout.
int main() {

std::string url1 ="http://10.130.0.36/rbkt10/+";
std::string out1 = getEncodedUrl(url1);
std::cout<<"Encoded URL1=:"<<out1<<std::endl;

return 0;
}

OUTPUT: Encoded URL1=:http://10.130.0.36/rbkt10/%2b+

So the output effectively contains the "+" twice (the encoded "%2b" followed by the original "+"). It should contain it only once. How can I make it work correctly?

解决方案

You're interpreting the original string as a regex. + is special in regex¹.

You should simply use std::string::replace because you don't need regex replace functionality:

boost::smatch what;
if (regex_search(url.cbegin(), url.cend(), what, expression)) {
    boost::ssub_match query = what[6];
    url.replace(query.first, query.second, urlEncode(query.str(), false));
}

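Complicated, scattered code like this:

std::string bktObjKey = "";
// ...
bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);

could simply be: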
```
std::string bktObjKey = what[6].str();
```

Complicated loop

for (std::string::size_type i = 0; i < toEncode.length(); ++i) {
     char ch = toEncode.at(i);

Could just be

for (char ch : toEncode) {

charToHex creates a new 2-char string everytime, using another stringstream everytime, copying the result out of the stringstream etc. Instead, just write to the stringstream you have and avoid all the inefficiency:
```
// Writes `c` to `os` as exactly two zero-padded hexadecimal digits.
//   os        - destination stream
//   c         - byte value to format
//   uppercase - true for digits A-F, false for a-f
// The stream's formatting flags and fill character are restored before
// returning, so later output on `os` (e.g. decimal integers) is unaffected.
void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
    const std::ios_base::fmtflags saved_flags = os.flags();
    const char saved_fill = os.fill('0');

    // Force right alignment so the '0' padding always lands in front of the digit.
    os << std::hex << std::right;
    if (uppercase)
        os << std::uppercase;
    else
        os << std::nouppercase;  // do not inherit a caller's sticky uppercase flag
    os << std::setw(2) << static_cast<int>(c);

    os.fill(saved_fill);
    os.flags(saved_flags);
}
```
Note that this also fixes the fact that you forgot to use bUpperCase.
Look at <cctype> for help classifying characters.

Use raw literals to write

boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");

instead as:

boost::regex expression(R"(^(([^:/?#]+):)?(//([^/?#:]*)(:\d+)?)?([^?#]*)((\?[^#]*))?(#(.*))?)");

(no need to doubly escape \d and \?)

Either drop all the redundant sub-groups

boost::regex expression(R"(^([^:/?#]+:)?(//[^/?#:]*(:\d+)?)?[^?#]*(\?[^#]*)?(#.*)?)");

OR make them maintainable and useful²:

boost::regex uri_regex(
    R"(^((?<scheme>[^:/?#]+):)?)"
    R"((?<authority>//(\?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
    R"((?<path>[^?#]*))"
    R"((\?(?<query>([^#]*)))?)"
    R"((#(?<fragment>.*))?)");

Now that you have access to logical components of the URI, apply it to know better when and where to encode:

    std::string escaped = 
       what["scheme"].str() + 
       what["authority"].str() +
       urlEncode(what["path"].str(), false);

    if (query.matched) {
        escaped += '?';
        escaped.append(urlEncode(query, true));
    }

    if (fragment.matched) {
        escaped += '#';
        escaped.append(urlEncode(fragment, true));
    }

Make an overload of urlEncode that takes an existing ostream reference instead of always creating your own:

std::ostringstream out;
out << what["scheme"] << what["authority"];
urlEncode(out, what["path"], false);

if (query.matched)
    urlEncode(out << '?', query, true);

if (fragment.matched)
    urlEncode(out << '#', fragment, true);

Code After Review

Live On Coliru

#include <boost/regex.hpp>
#include <iostream>
#include <iomanip>

// Writes `c` to `os` as a percent escape: '%' followed by exactly two
// zero-padded hexadecimal digits.
//   os        - destination stream
//   c         - byte value to escape
//   uppercase - true for digits A-F, false for a-f
// The stream's formatting flags and fill character are restored before
// returning, so later output on `os` (e.g. decimal integers) is unaffected.
void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
    const std::ios_base::fmtflags saved_flags = os.flags();
    const char saved_fill = os.fill('0');

    // Force right alignment so the '0' padding always lands in front of the digit.
    os << std::hex << std::right;
    if (uppercase)
        os << std::uppercase;
    else
        os << std::nouppercase;  // do not inherit a caller's sticky uppercase flag
    os << '%' << std::setw(2) << static_cast<int>(c);

    os.fill(saved_fill);
    os.flags(saved_flags);
}

// Percent-encodes `toEncode` into `os`.
// ASCII letters, digits and '-', '.', '_', '~' are copied unchanged; '/' is
// copied unchanged only when `bEncodeForwardSlash` is false. Every other byte,
// including NUL and bytes >= 0x80, is written as an uppercase "%XX" escape.
void urlEncode(std::ostream& os, const std::string &toEncode, bool bEncodeForwardSlash) {
    // Explicit ASCII ranges keep the classification independent of the C locale,
    // and (unlike a strchr lookup) never treat the NUL byte as a safe character.
    const auto is_safe = [bEncodeForwardSlash](unsigned char ch) {
        const bool is_alnum = (ch >= 'A' && ch <= 'Z') ||
                              (ch >= 'a' && ch <= 'z') ||
                              (ch >= '0' && ch <= '9');
        const bool is_mark = ch == '-' || ch == '.' || ch == '_' || ch == '~';
        return is_alnum || is_mark || (ch == '/' && !bEncodeForwardSlash);
    };

    for (const char ch : toEncode) {
        const auto octet = static_cast<unsigned char>(ch);
        if (is_safe(octet))
            os << ch;
        else
            writeHex(os, octet, true);
    }
}

// Convenience overload: returns the percent-encoded form of `toEncode` as a
// new string. See the stream overload for the encoding rules.
std::string urlEncode(const std::string &toEncode, bool bEncodeForwardSlash) {
    std::ostringstream out;
    urlEncode(out, toEncode, bEncodeForwardSlash);
    return out.str();
}

// Matches `url` (taken by value so it can be edited in place) against a URI
// regex with named groups, rebuilds it with the path, query and fragment
// percent-encoded, and returns the result. When the regex does not match,
// `url` is returned unchanged.
std::string getEncodedUrl(std::string url) {

    // Named groups: scheme, authority (with port inside it), path, query, fragment.
    // The "scheme" group captures the text before ':' but not the ':' itself.
    // NOTE(review): `(\?<host>` escapes the '?', so it reads as a literal
    // "?<host>" rather than a named group - looks like a typo for `(?<host>`;
    // confirm against the intended URI grammar.
    boost::regex uri_regex(
        R"(^((?<scheme>[^:/?#]+):)?)"
        R"((?<authority>//(\?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
        R"((?<path>[^?#]*))"
        R"((\?(?<query>([^#]*)))?)"
        R"((#(?<fragment>.*))?)");

    // Mutable iterators are used so the matched range can be handed to
    // std::string::replace below.
    boost::match_results<std::string::iterator> what;
    //boost::smatch what;
    if (regex_search(url.begin(), url.end(), what, uri_regex)) {
        auto& full     = what[0];
        auto& query    = what["query"];
        auto& fragment = what["fragment"];

        std::ostringstream out;
        // Scheme and authority are copied through without encoding; the ':'
        // that follows the scheme is outside the "scheme" group and is not
        // written here.
        out << what["scheme"] << what["authority"];
        // '/' is kept literally in the path ...
        urlEncode(out, what["path"], false);

        // ... but percent-encoded inside the query and the fragment.
        if (query.matched)
            urlEncode(out << '?', query, true);

        if (fragment.matched)
            urlEncode(out << '#', fragment, true);

        // Swap the whole matched range for the rebuilt text.
        url.replace(full.begin(), full.end(), out.str());
    }
    return url;
}

// Demo driver: runs getEncodedUrl() over a fixed list of sample inputs and
// prints each result on its own line.
int main() {
    for (std::string url : { 
            "http://10.130.0.36/rbkt10/+",
            "//10.130.0.36/rbkt10/+",
            "//localhost:443/rbkt10/+",
            "https:/rbkt10/+",
            "https:/rbkt10/+?in_params='please do escape / (forward slash)'&more#also=in/fragment",
            "match inside text http://10.130.0.36/rbkt10/+ is a bit fuzzy",
          }) {
        std::cout << "Encoded URL: " << getEncodedUrl(url) << std::endl;
    }
}

Prints

Encoded URL: http//10.130.0.36/rbkt10/%2B
Encoded URL: //10.130.0.36/rbkt10/%2B
Encoded URL: //localhost%3A443/rbkt10/%2B
Encoded URL: https/rbkt10/%2B
Encoded URL: https/rbkt10/%2B?in_params%3D%27please%20do%20escape%20%2F%20%28forward%20slash%29%27%26more#also%3Din%2Ffragment
Encoded URL: match inside text http//10.130.0.36/rbkt10/%2B%20is%20a%20bit%20fuzzy

CAUTION

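Notice that the code STILL doesn't adhere to the specs. For example, in the output above the ':' after the scheme is lost ("http//10.130.0.36/...") and the ':' before the port is percent-encoded ("//localhost%3A443/...").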

This is why you use a library instead.

¹ (This causes the + to be left over from the input. It's not "repeated"; it's just not replaced, because in a regex /+ means one or more / characters.)

² See https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax

这篇关于在进行 URL 编码时 std::regex_replace 无法正确处理字符 "+" 的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

在进行 URL 编码时，std::regex_replace 无法正确处理字符 "+" [英] While doing url encoding, the std::regex_replace doesn't work properly for character "+"

问题描述

注意

Code After Review

CAUTION

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

在进行 URL 编码时，std::regex_replace 无法正确处理字符 "+" [英] While doing url encoding, the std::regex_replace doesn't work properly for character "+"

问题描述

注意

Code After Review

CAUTION

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

在进行 URL 编码时，std::regex_replace 无法正确处理字符 "+" [英] While doing url encoding, the std::regex_replace doesn't work properly for character "+"

登录关闭