如何从输入字符串中提取子字符串? [英] How to extract substrings from input strings ?

查看:99
本文介绍了如何从输入字符串中提取子字符串?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有许多具有以下结构的输入字符串,并希望从中提取一些值:

 inputStr[i]="[Rx][MAX COV=y]A: DEL(A,n(m,0)),LB(A,sb,lb),lOC(A,k,s,l).";


对于每个输入字符串,LB(...) 和 LOC(...) 子字符串的数量是可变的(>= 0)且顺序不固定,但 DEL(...) 子字符串只有一个。

如何从每个输入字符串中提取x,y,m,sb,lb,k,s,l变量?

inputStr[1]="[R1][MAX COV=2]A: DEL(A,n(2,0)),LB(A,sb1,lb1),lOC(A,k1,s1,l1).";
inputStr[2]="[R5][MAX COV=4]A: LB(A,sb2,lb2),lOC(A,k2,s2,l2),DEL(A,n(3,0)),LB(A,sb3,lb3).";
inputStr[3]="[R12][MAX COV=2]A: DEL(A,n(1,0)),LB(A,sb3,lb3),lOC(A,k3,s3,l3),lOC(A,k3,s3,l3).";




// Pseudo-code from the question (restored from the English original; the
// translation had dropped the opening parenthesis and the string quotes).
// x, y, m, sb, lb, k, s and l are placeholders for the values to be extracted
// from inputStr[i]; "string LB += ..." only sketches the intended accumulation
// and is not valid C# as written.
for (int i=0; i<inputStr.Length; i++)
{
/////////How to extract ... from inputStr[i]?????
    int R=x;
    int COV=y;
    int M=m;
    string LB += "sb lb" + " ";
    string LOC += "k s l" + " ";
////
////processing...
/////
}





请参阅inputStr [2]的示例:

  // Values the question expects for the inputStr[2] sample. The string
  // literals' quotes (and trailing space) were lost in translation and are
  // restored here.
  int R=5;
  int COV=4;
  int M=3;
  string LB = "sb2 lb2 sb3 lb3 ";
  string LOC = "k2 s2 l2 ";

解决方案

使用正则表达式可以解决这个问题。

表达式是有点复杂,但以下代码非常简单。



为了将来能够理解和修改这些正则表达式,我建议学习一下这个主题,并下载一个可以用来测试表达式的工具。



我通常推荐这个网站学习正则表达式:正则表达式信息/ [ ^ ]

我自己使用这个工具:正则表达式测试 [ ^ ]



现在为解决方案

我选择创建两个表达式。



第一个表达式将解析整个字符串,挑出x和y,但会将字符串的结尾视为一个子字符串。

[Rx][MAX COV=y]A: DEL(A,n(m,0)),LB(A,sb,lb),lOC(A,k,s,l).

^\[R(?<x>[0-9]+)\]\[MAX COV=(?<y>[0-9]+)\]A:\s(?<vars>[\S\s]+)$







第二个表达式将以结尾部分 DEL(A,n(m,0)),LB(A,sb,lb),LOC(A,k,s,l) 作为输入,并提取其中的各个子字符串

((?=(?<name>DEL)\(A,n\((?<m>[0-9]+),0\)\))|(?=(?<name>LB)\(A,sb(?<sb>[0-9]+),lb(?<lb>[0-9]+)\))|(?=(?<name>LOC)\(A,k(?<k>[0-9]+),s(?<s>[0-9]+),l(?<l>[0-9]+)\)))(,|.)





这两个表达式都使用命名组,以便之后在代码中轻松提取变量。



为了更容易处理各种变量,我创建了一些结构

  /// <summary>
  /// Holds the <c>m</c> value taken from a <c>DEL(A,n(m,0))</c> token.
  /// (Restored: the translation had dropped <c>this</c> and translated <c>get</c>.)
  /// </summary>
  public struct DEL
  {
      /// <summary>Creates a DEL that stores the given value.</summary>
      /// <param name="m">Value to store in the <c>m</c> property.</param>
      public DEL(int m)
      {
          this.m = m;
      }

      /// <summary>The <c>m</c> value of the token.</summary>
      public int m { get; set; }
  }

/// <summary>
/// Holds the numeric <c>lb</c> and <c>sb</c> values taken from an
/// <c>LB(A,sb..,lb..)</c> token. (Restored: the translation had replaced
/// <c>get</c> with Chinese text.)
/// </summary>
public struct LB
{
    /// <summary>Creates an LB from its two values.</summary>
    /// <param name="lb">Value to store in the <c>lb</c> property.</param>
    /// <param name="sb">Value to store in the <c>sb</c> property.</param>
    public LB(int lb, int sb)
    {
        this.lb = lb;
        this.sb = sb;
    }

    /// <summary>The <c>lb</c> value of the token.</summary>
    public int lb { get; set; }

    /// <summary>The <c>sb</c> value of the token.</summary>
    public int sb { get; set; }
}

/// <summary>
/// Holds the <c>k</c>, <c>s</c> and <c>l</c> values taken from a
/// <c>LOC(A,k..,s..,l..)</c> token. (Restored: the translation had dropped
/// <c>this</c> on one assignment and translated <c>get</c>.)
/// </summary>
public struct LOC
{
    /// <summary>Creates a LOC from its three values.</summary>
    /// <param name="k">Value to store in the <c>k</c> property.</param>
    /// <param name="s">Value to store in the <c>s</c> property.</param>
    /// <param name="l">Value to store in the <c>l</c> property.</param>
    public LOC(int k, int s, int l)
    {
        this.k = k;
        this.s = s;
        this.l = l;
    }

    /// <summary>The <c>k</c> value of the token.</summary>
    public int k { get; set; }

    /// <summary>The <c>s</c> value of the token.</summary>
    public int s { get; set; }

    /// <summary>The <c>l</c> value of the token.</summary>
    public int l { get; set; }
}

/// <summary>
/// Container for everything extracted from one input string: <c>x</c>,
/// <c>y</c>, a single DEL value and the lists of LB and LOC values.
/// (Restored: the translation had replaced <c>get</c> and <c>List</c> with
/// Chinese text.)
/// </summary>
public class ExtractedVariables
{
    /// <summary>Creates an instance with zeroed numbers and empty lists.</summary>
    public ExtractedVariables()
    {
        x = 0;
        y = 0;
        del = new DEL();         // There can be only one
        lbList = new List<LB>();
        locList = new List<LOC>();
    }

    /// <summary>The <c>x</c> value.</summary>
    public int x { get; set; }

    /// <summary>The <c>y</c> value.</summary>
    public int y { get; set; }

    /// <summary>The DEL value.</summary>
    public DEL del { get; set; }

    /// <summary>The LB values collected so far.</summary>
    public List<LB> lbList { get; set; }

    /// <summary>The LOC values collected so far.</summary>
    public List<LOC> locList { get; set; }
}





我将这两个正则表达式声明为静态成员变量

  // Matches a whole input string: captures the digits of "[Rx]" as "x", the
  // digits of "[MAX COV=y]" as "y", and everything after "A:" plus one
  // whitespace character as "vars". (Restored from the garbled translation.)
  private static readonly Regex stringExpression = new Regex(
      @"^\[R(?<x>[0-9]+)\]\[MAX COV=(?<y>[0-9]+)\]A:\s(?<vars>[\S\s]+)$");

  // Finds each DEL(...), LB(...) and LOC(...) token inside "vars". The group
  // "name" holds the token kind; "m", "sb"/"lb" and "k"/"s"/"l" hold the digits.
  // Each token is consumed directly rather than via a look-ahead followed by
  // "(,|.)", whose unescaped dot only worked by swallowing the token's first letter.
  // Matching is case-sensitive, so a token spelled "lOC" is not recognised.
  private static readonly Regex variableExpression = new Regex(
      @"(?:(?<name>DEL)\(A,n\((?<m>[0-9]+),0\)\)" +
      @"|(?<name>LB)\(A,sb(?<sb>[0-9]+),lb(?<lb>[0-9]+)\)" +
      @"|(?<name>LOC)\(A,k(?<k>[0-9]+),s(?<s>[0-9]+),l(?<l>[0-9]+)\))");





然后是一个每次处理一个字符串的小方法

  /// <summary>
  /// Extracts x, y and the DEL/LB/LOC values from a single input string.
  /// (Restored from the garbled translation: keywords, quotes and group
  /// names had been lost.)
  /// </summary>
  /// <param name="input">The string to parse.</param>
  /// <returns>
  /// A new <c>ExtractedVariables</c>; it is returned exactly as constructed
  /// when <c>stringExpression</c> does not match <paramref name="input"/>.
  /// </returns>
  /// <exception cref="Exception">
  /// A matched token's "name" group is not DEL, LB or LOC.
  /// </exception>
  public ExtractedVariables ExctractVariables(string input)
  {
      ExtractedVariables result = new ExtractedVariables();

      Match m = stringExpression.Match(input);
      if (m.Success)
      {
          result.x = int.Parse(m.Groups["x"].Value);
          result.y = int.Parse(m.Groups["y"].Value);

          string variables = m.Groups["vars"].Value;

          foreach (Match mSub in variableExpression.Matches(variables))
          {
              string name = mSub.Groups["name"].Value;
              switch (name)
              {
                  case "DEL":
                      // A later DEL token replaces the value stored by an earlier one.
                      result.del = new DEL(int.Parse(mSub.Groups["m"].Value));
                      break;

                  case "LB":
                      result.lbList.Add(new LB(int.Parse(mSub.Groups["lb"].Value), int.Parse(mSub.Groups["sb"].Value)));
                      break;

                  case "LOC":
                      result.locList.Add(new LOC(int.Parse(mSub.Groups["k"].Value), int.Parse(mSub.Groups["s"].Value), int.Parse(mSub.Groups["l"].Value)));
                      break;

                  default:
                      throw new Exception(string.Format("Unknown sub string name '{0}'.", name));
              }
          }
      }

      return result;
  }





最后一些测试代码

  // Sample inputs: each has one DEL token plus a varying number of LB/LOC
  // tokens. Restored from the garbled translation (quotes and parentheses were
  // lost); the last token of the third sample is spelled "LOC" so that the
  // case-sensitive pattern picks it up.
  string[] inputStr = new string[3];
  inputStr[0] = "[R1][MAX COV=2]A: DEL(A,n(2,0)),LB(A,sb1,lb1),LOC(A,k1,s1,l1).";
  inputStr[1] = "[R5][MAX COV=4]A: LB(A,sb2,lb2),LOC(A,k2,s2,l2),DEL(A,n(3,0)),LB(A,sb3,lb3).";
  inputStr[2] = "[R12][MAX COV=2]A: DEL(A,n(1,0)),LB(A,sb3,lb3),LOC(A,k3,s3,l3),LOC(A,k3,s3,l3).";
  foreach (string s in inputStr)
  {
      // Parse one sample and dump every extracted value.
      ExtractedVariables result = ExctractVariables(s);
      Debug.WriteLine("x = {0}", result.x);
      Debug.WriteLine("y = {0}", result.y);
      Debug.WriteLine("\tm = {0}", result.del.m);

      foreach (LB lb in result.lbList)
      {
          Debug.WriteLine("\tlb = {0}\tsb = {1}", lb.lb, lb.sb);
      }

      foreach (LOC loc in result.locList)
      {
          Debug.WriteLine("\tk = {0}\ts = {1}\tl = {2}", loc.k, loc.s, loc.l);
      }
  }





[更新]

提取sb< x>和lb< x>作为字符串,只需将lb和sb移动到相应的组中。

  // [UPDATE] variant: the "sb" and "lb" groups now include their prefix
  // (e.g. "sb2", "lb2") so they can be read as strings. Restored from the
  // garbled translation; each token is consumed directly instead of using a
  // look-ahead followed by "(,|.)", whose unescaped dot matched any character.
  private static readonly Regex variableExpression = new Regex(
      @"(?:(?<name>DEL)\(A,n\((?<m>[0-9]+),0\)\)" +
      @"|(?<name>LB)\(A,(?<sb>sb[0-9]+),(?<lb>lb[0-9]+)\)" +
      @"|(?<name>LOC)\(A,k(?<k>[0-9]+),s(?<s>[0-9]+),l(?<l>[0-9]+)\))");





当然,你需要更改结构

  /// <summary>
  /// String-valued version of LB: holds the <c>lb</c> and <c>sb</c> texts of
  /// an LB token. (Restored: the translation had replaced <c>this</c> and
  /// <c>get</c> with Chinese text.)
  /// </summary>
  public struct LB
  {
      /// <summary>Creates an LB from its two texts.</summary>
      /// <param name="lb">Text to store in the <c>lb</c> property.</param>
      /// <param name="sb">Text to store in the <c>sb</c> property.</param>
      public LB(string lb, string sb)
      {
          this.lb = lb;
          this.sb = sb;
      }

      /// <summary>The <c>lb</c> text of the token.</summary>
      public string lb { get; set; }

      /// <summary>The <c>sb</c> text of the token.</summary>
      public string sb { get; set; }
  }



和LB案例

  case "LB":
      // String-valued LB: the group texts are passed through without int.Parse.
      // (Restored: the translation had dropped the quotes and the colon.)
      result.lbList.Add(new LB(mSub.Groups["lb"].Value, mSub.Groups["sb"].Value));
      break;


解析你的例子并非易事。阅读本书。



https:// en.wikipedia.org/wiki/Compilers:_Principles,_Techniques,_and_Tools [ ^ ]



为了简单起见,请专注于编写左匹配的递归下降解析器。


I have many input strings with following structure and want to extract some values from it:

inputStr[i]="[Rx][MAX COV=y]A: DEL(A,n(m,0)),LB(A,sb,lb),lOC(A,k,s,l).";


For each input string, the number of LB(...) and LOC(...) substrings is variable (>= 0) and they may appear in any order, but there is exactly one DEL(...) substring.
How can I extract the x, y, m, sb, lb, k, s and l variables from each input string?

inputStr[1]="[R1][MAX COV=2]A: DEL(A,n(2,0)),LB(A,sb1,lb1),lOC(A,k1,s1,l1).";
inputStr[2]="[R5][MAX COV=4]A: LB(A,sb2,lb2),lOC(A,k2,s2,l2),DEL(A,n(3,0)),LB(A,sb3,lb3).";
inputStr[3]="[R12][MAX COV=2]A: DEL(A,n(1,0)),LB(A,sb3,lb3),lOC(A,k3,s3,l3),lOC(A,k3,s3,l3).";
.
.
.

// Pseudo-code from the question: x, y, m, sb, lb, k, s and l are placeholders
// for the values to be extracted from inputStr[i].
// NOTE(review): "lenght" is presumably meant to be the array's Length property,
// and "string LB += ..." only sketches the intended accumulation; neither is
// valid C# as written.
for (int i=0; i<inputStr.lenght; i++)
{
/////////How to extract ... from inputStr[i]?????
    int R=x;
    int COV=y;
    int M=m;
    string LB += "sb lb" + " ";
    string LOC += "k s l" + " ";
////
////processing...
/////
}



See example for inputStr[2]:

// Values the question expects for the inputStr[2] sample:
// LB and LOC are space-separated concatenations of the extracted parts.
int R=5;
int COV=4;
int M=3;
string LB = "sb2 lb2 sb3 lb3 ";
string LOC = "k2 s2 l2 ";

解决方案

This problem can be solved using regular expressions.
The expressions are a bit complex, but the following code is pretty straightforward.

In order to be able to understand and change the regular expressions in the future, I recommend to study the subject and also to download a tool that can be used to test the expressions.

I usually recommend this site for learning regex: Regular Expressions Info/[^]
And I use this tool myself: Regex Test[^]

Now for the solution
I opted for creating two expressions.

The first expression will parse the whole string, pick out x and y but will treat the end of the string as one sub string.
[Rx][MAX COV=y]A: DEL(A,n(m,0)),LB(A,sb,lb),lOC(A,k,s,l).

^\[R(?<x>[0-9]+)\]\[MAX COV=(?<y>[0-9]+)\]A:\s(?<vars>[\S\s]+)$




The second expression will use the end part, DEL(A,n(m,0)),LB(A,sb,lb),LOC(A,k,s,l), as input and extract the different sub-strings

((?=(?<name>DEL)\(A,n\((?<m>[0-9]+),0\)\))|(?=(?<name>LB)\(A,sb(?<sb>[0-9]+),lb(?<lb>[0-9]+)\))|(?=(?<name>LOC)\(A,k(?<k>[0-9]+),s(?<s>[0-9]+),l(?<l>[0-9]+)\)))(,|.)



The two expressions are using named groups to make it easy to extract the variables later in code.

To make it easier to handle the various variables, I created some structs

/// <summary>
/// Holds the <c>m</c> value taken from a <c>DEL(A,n(m,0))</c> token.
/// </summary>
public struct DEL
{
    /// <summary>The <c>m</c> value of the token.</summary>
    public int m { get; set; }

    /// <summary>Creates a DEL that stores the given value.</summary>
    /// <param name="m">Value to store in the <c>m</c> property.</param>
    public DEL(int m)
        : this()
    {
        this.m = m;
    }
}

/// <summary>
/// Holds the numeric <c>lb</c> and <c>sb</c> values taken from an
/// <c>LB(A,sb..,lb..)</c> token.
/// </summary>
public struct LB
{
    /// <summary>The <c>lb</c> value of the token.</summary>
    public int lb { get; set; }

    /// <summary>The <c>sb</c> value of the token.</summary>
    public int sb { get; set; }

    /// <summary>Creates an LB from its two values.</summary>
    /// <param name="lb">Value to store in the <c>lb</c> property.</param>
    /// <param name="sb">Value to store in the <c>sb</c> property.</param>
    public LB(int lb, int sb)
        : this()
    {
        this.lb = lb;
        this.sb = sb;
    }
}

/// <summary>
/// Holds the <c>k</c>, <c>s</c> and <c>l</c> values taken from a
/// <c>LOC(A,k..,s..,l..)</c> token.
/// </summary>
public struct LOC
{
    /// <summary>The <c>k</c> value of the token.</summary>
    public int k { get; set; }

    /// <summary>The <c>s</c> value of the token.</summary>
    public int s { get; set; }

    /// <summary>The <c>l</c> value of the token.</summary>
    public int l { get; set; }

    /// <summary>Creates a LOC from its three values.</summary>
    /// <param name="k">Value to store in the <c>k</c> property.</param>
    /// <param name="s">Value to store in the <c>s</c> property.</param>
    /// <param name="l">Value to store in the <c>l</c> property.</param>
    public LOC(int k, int s, int l)
        : this()
    {
        this.k = k;
        this.s = s;
        this.l = l;
    }
}

/// <summary>
/// Container for everything extracted from one input string: <c>x</c>,
/// <c>y</c>, a single DEL value and the lists of LB and LOC values.
/// A new instance starts with zeroed numbers, a default DEL and empty lists.
/// </summary>
public class ExtractedVariables
{
    /// <summary>The <c>x</c> value.</summary>
    public int x { get; set; } = 0;

    /// <summary>The <c>y</c> value.</summary>
    public int y { get; set; } = 0;

    /// <summary>The DEL value; there can be only one.</summary>
    public DEL del { get; set; } = new DEL();

    /// <summary>The LB values collected so far.</summary>
    public List<LB> lbList { get; set; } = new List<LB>();

    /// <summary>The LOC values collected so far.</summary>
    public List<LOC> locList { get; set; } = new List<LOC>();
}



The regular expressions I declared as static member variables

// Matches a whole input string: captures the digits of "[Rx]" as "x", the
// digits of "[MAX COV=y]" as "y", and everything after "A:" plus one
// whitespace character as "vars". The literal was previously split across
// several lines (unterminated string); the closing "$" anchor is restored.
// "[\S\s]+" is greedy and already runs to the end, so "$" does not change
// which inputs match.
private static readonly Regex stringExpression = new Regex(
    @"^\[R(?<x>[0-9]+)\]\[MAX COV=(?<y>[0-9]+)\]A:\s(?<vars>[\S\s]+)$");

// Finds each DEL(...), LB(...) and LOC(...) token inside "vars". The group
// "name" holds the token kind; "m", "sb"/"lb" and "k"/"s"/"l" hold the digits.
// Each token is consumed directly. The earlier form used a zero-width
// look-ahead followed by "(,|.)", and only worked because the unescaped dot
// swallowed the first letter of the token.
// Matching is case-sensitive, so a token spelled "lOC" is not recognised.
private static readonly Regex variableExpression = new Regex(
    @"(?:(?<name>DEL)\(A,n\((?<m>[0-9]+),0\)\)" +
    @"|(?<name>LB)\(A,sb(?<sb>[0-9]+),lb(?<lb>[0-9]+)\)" +
    @"|(?<name>LOC)\(A,k(?<k>[0-9]+),s(?<s>[0-9]+),l(?<l>[0-9]+)\))");



Then a little method that handles one string at a time

/// <summary>
/// Extracts x, y and the DEL/LB/LOC values from a single input string.
/// </summary>
/// <param name="input">The string to parse.</param>
/// <returns>
/// A new <c>ExtractedVariables</c>; it is returned exactly as constructed
/// when <c>stringExpression</c> does not match <paramref name="input"/>.
/// </returns>
/// <exception cref="Exception">
/// A matched token's "name" group is not DEL, LB or LOC.
/// </exception>
public ExtractedVariables ExctractVariables(string input)
{
    ExtractedVariables extracted = new ExtractedVariables();

    Match header = stringExpression.Match(input);
    if (!header.Success)
    {
        // Nothing recognised: hand back the freshly constructed result.
        return extracted;
    }

    extracted.x = int.Parse(header.Groups["x"].Value);
    extracted.y = int.Parse(header.Groups["y"].Value);

    string tail = header.Groups["vars"].Value;
    foreach (Match token in variableExpression.Matches(tail))
    {
        GroupCollection groups = token.Groups;
        string kind = groups["name"].Value;

        if (kind == "DEL")
        {
            // A later DEL token replaces the value stored by an earlier one.
            extracted.del = new DEL(int.Parse(groups["m"].Value));
        }
        else if (kind == "LB")
        {
            // Parse order (lb, then sb) follows the LB constructor's argument order.
            int lbValue = int.Parse(groups["lb"].Value);
            int sbValue = int.Parse(groups["sb"].Value);
            extracted.lbList.Add(new LB(lbValue, sbValue));
        }
        else if (kind == "LOC")
        {
            int kValue = int.Parse(groups["k"].Value);
            int sValue = int.Parse(groups["s"].Value);
            int lValue = int.Parse(groups["l"].Value);
            extracted.locList.Add(new LOC(kValue, sValue, lValue));
        }
        else
        {
            throw new Exception(string.Format("Unknown sub string name '{0}'.", kind));
        }
    }

    return extracted;
}



And at last some test code

// Sample inputs: each has one DEL token plus a varying number of LB/LOC tokens.
// The last token of the third sample used to be spelled "lOC". The pattern is
// case-sensitive, so that token was silently skipped; it is spelled "LOC" here
// so that both LOC entries are reported.
string[] inputStr =
{
    "[R1][MAX COV=2]A: DEL(A,n(2,0)),LB(A,sb1,lb1),LOC(A,k1,s1,l1).",
    "[R5][MAX COV=4]A: LB(A,sb2,lb2),LOC(A,k2,s2,l2),DEL(A,n(3,0)),LB(A,sb3,lb3).",
    "[R12][MAX COV=2]A: DEL(A,n(1,0)),LB(A,sb3,lb3),LOC(A,k3,s3,l3),LOC(A,k3,s3,l3)."
};

foreach (string s in inputStr)
{
    // Parse one sample and dump every extracted value.
    ExtractedVariables result = ExctractVariables(s);
    Debug.WriteLine("x = {0}", result.x);
    Debug.WriteLine("y = {0}", result.y);
    Debug.WriteLine("\tm = {0}", result.del.m);

    foreach (LB lb in result.lbList)
    {
        Debug.WriteLine("\tlb = {0}\tsb = {1}", lb.lb, lb.sb);
    }

    foreach (LOC loc in result.locList)
    {
        Debug.WriteLine("\tk = {0}\ts = {1}\tl = {2}", loc.k, loc.s, loc.l);
    }
}



[UPDATE]
To extract sb<x> and lb<x> as strings, just move lb and sb inside the respective group.

// [UPDATE] variant: the "sb" and "lb" groups now include their prefix
// (e.g. "sb2", "lb2") so they can be read as strings. Each token is consumed
// directly. The earlier form used a zero-width look-ahead followed by "(,|.)",
// and only worked because the unescaped dot swallowed the first letter of the
// token. Matching is case-sensitive.
private static readonly Regex variableExpression = new Regex(
    @"(?:(?<name>DEL)\(A,n\((?<m>[0-9]+),0\)\)" +
    @"|(?<name>LB)\(A,(?<sb>sb[0-9]+),(?<lb>lb[0-9]+)\)" +
    @"|(?<name>LOC)\(A,k(?<k>[0-9]+),s(?<s>[0-9]+),l(?<l>[0-9]+)\))");



Then of course, you need to change the struct

/// <summary>
/// String-valued version of LB: holds the <c>lb</c> and <c>sb</c> texts of
/// an LB token.
/// </summary>
public struct LB
{
    /// <summary>The <c>lb</c> text of the token.</summary>
    public string lb { get; set; }

    /// <summary>The <c>sb</c> text of the token.</summary>
    public string sb { get; set; }

    /// <summary>Creates an LB from its two texts.</summary>
    /// <param name="lb">Text to store in the <c>lb</c> property.</param>
    /// <param name="sb">Text to store in the <c>sb</c> property.</param>
    public LB(string lb, string sb)
        : this()
    {
        this.lb = lb;
        this.sb = sb;
    }
}


and the LB case

case "LB":
{
    // String-valued LB: the group texts are passed through without int.Parse.
    GroupCollection groups = mSub.Groups;
    result.lbList.Add(new LB(groups["lb"].Value, groups["sb"].Value));
    break;
}


Parsing your example is non-trivial. Read this book.

https://en.wikipedia.org/wiki/Compilers:_Principles,_Techniques,_and_Tools[^]

To keep things simple, focus on writing a left-matching recursive descent parser.


这篇关于如何从输入字符串中提取子字符串?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆