从 SQL Server 记录解析 #Hashtag 注释 [英] Parse #Hashtag Comments From SQL Server Record

查看:55
本文介绍了从 SQL Server 记录解析 #Hashtag 注释的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试从 SQL Server 2014 数据库的文本字段中提取所有"#"注释。我正在使用在这个 MSDN 帖子中找到的代码来查找一条记录中的多个主题标签,并根据自己的需要稍作修改,但结果集中出现了一些意外的结果。

示例数据:

-- Repro data for the phantom "Order:" tag.
-- ticketDescription holds the raw ticket text; RetVal appears to record the token the
-- extraction query produced for that row (to be confirmed against the original report).
CREATE TABLE #temptable
(
    [ticketDescription] varchar(max),
    [RetVal]            varchar(max)
);

-- Explicit column list so the insert keeps working if the table definition changes.
INSERT INTO #temptable ([ticketDescription], [RetVal])
VALUES
    ( 'DURABLE HARD CARD RETURN-WAS GOING TO CALL TO PICK UP  NO# NUMBER ', 'DURABLE' ),
    ( 'Order: 30341143OrderItemId: 30517890License for: NATHAN TIMOTHY SUMNER  Printed on: 10 Apr 2017 06:43:57:857 Shipped to: 7650 KIOWA ST, APT #01 MILLINGTON TN 38053 - 3219', 'Order:' ),
    ( 'Order: 30341143OrderItemId: 30517890License for: NATHAN TIMOTHY SUMNER  Printed on: 10 Apr 2017 06:43:57:857 Shipped to: 7650 KIOWA ST, APT #01 MILLINGTON TN 38053 - 3219', '01' ),
    ( 'Order: 30346281OrderItemId: 30526511License for: STANLEY R ROWLAND  Printed on: 07 Apr 2017 06:22:23:417 Shipped to: 25 COUNTRY WOOD LN # 601 WALNUT MS 38683 - 5367', 'Order:' ),
    ( 'Order: 30347906OrderItemId: 30529325License for: DOUGLAS R EASTRIDGE  Printed on: 07 Apr 2017 09:29:51:643 Shipped to: 7980 HUFFS FERRY RD N # R LOUDON TN 37774 - 5910', 'Order:' ),
    ( 'Order: 30361947OrderItemId: 30554547License for: BLAKE R HADDON  Printed on: 07 Apr 2017 14:01:42:637 Shipped to: 146 14TH AVE NW # 146 WINCHESTER TN 37398 - 1079', 'Order:' ),
    ( 'Order: 30362075OrderItemId: 30554740License for: KYLE  JACKSON  Printed on: 07 Apr 2017 14:04:23:473 Shipped to: 3765 E ANDREW JOHNSON HWY, APT # A4 MORRISTOWN TN 37814 - 6200', 'Order:' ),
    ( 'Order: 30369152OrderItemId: 30565137License for: DANIEL JAMES SOLA  Printed on: 09 Apr 2017 07:08:28:683 Shipped to: 3212 MAYES LOOP RD # 1 PIGEON FORGE TN 37863 - 7721', 'Order:' ),
    ( 'Order: 30370097OrderItemId: 30566543License for: JAMES D JOHNSON  Printed on: 09 Apr 2017 11:51:37:170 Shipped to: 230 FRANKLIN RD # 907 FRANKLIN TN 37064 - 2256', 'Order:' ),
    ( 'Order: 30372876OrderItemId: 30571083License for: MARCOS  CLAUDIO P POLONIATO  Printed on: 06 Apr 2017 08:15:01:093 Shipped to: 295 WINDING RIVER DR, # J SANDY SPRINGS GA 30350 - 1926', 'Order:' ),
    ( 'Order: 30396415OrderItemId: 30604206License for: GARY T GOODMAN  Printed on: 07 Apr 2017 15:11:20:317 Shipped to: 1046 GREENBRIAR RD # RD.423 TALBOTT TN 37877 - 9055', 'Order:' ),
    ( 'Order: 30405689OrderItemId: 30617970License for: VANCE K JOHNSON  Printed on: 09 Apr 2017 09:25:48:670 Shipped to: 614 GARRISON HOLLOW RD, LOT # 11 ELIZABETHTON TN 37643 - 4897', 'Order:' );

SELECT [ticketDescription], [RetVal]
FROM   #temptable;

DROP TABLE #temptable;

<小时><块引用>

编辑 - 修复

-- Count hashtags in #temptable.
-- A '#>>> ' sentinel is prepended so that any text preceding the first real '#' is
-- attached to the sentinel and surfaces as the token '>>>', which is then discarded.
-- Tokens that are exactly two digits (e.g. the '01' in "APT #01") are discarded too.
SELECT
    '#' + B.RetVal AS HashTag,
    COUNT(*)       AS Cnt
FROM #temptable AS A
CROSS APPLY [dbo].[udf-Str-Extract](
                [dbo].[udf-Str-Strip-Control]('#>>> ' + A.TicketDescription) + ' ',
                '#',
                ' '
            ) AS B
WHERE B.RetVal <> '>>>'
  AND B.RetVal NOT LIKE '[0-9][0-9]'
GROUP BY B.RetVal
ORDER BY HashTag;

解决方案

借助两个函数和一个 CROSS APPLY,以下内容可能会有所帮助.

第一个函数会去除所有控制字符并将其替换为空格,这样相邻的单词就不会被拼接在一起:例如 John{13}{10}Smith 会返回为 John Smith。

第二个函数是一个修改后的解析函数,它接受两个不同的分隔符(开始/结束).在您的情况下,# 和空格.如果有多个hashtag,会返回多条记录.

示例

-- Count hashtags across dbo.Trending.
-- udf-Str-Extract also evaluates the text that precedes the first '#', so a description
-- that does not start with '#' would report its first word as a tag. Prepending the
-- '#>>> ' sentinel turns that leading text into the token '>>>', which is filtered out.
-- The trailing ' ' guarantees a tag at the very end of the text is still terminated.
SELECT
    '#' + B.RetVal AS HashTag,
    COUNT(*)       AS Cnt
FROM trending AS A
CROSS APPLY [dbo].[udf-Str-Extract](
                [dbo].[udf-Str-Strip-Control]('#>>> ' + A.TicketDescription) + ' ',
                '#',
                ' '
            ) AS B
WHERE B.RetVal <> '>>>'
GROUP BY B.RetVal
ORDER BY HashTag;

返回结果

HashTag                 Cnt
#callback               1
#nochargereprint        1
#nocostreprint          1
#Notes                  1
#paperlicense           1   ---<< 文本中的第二个主题标签
#permissions            1
#printerissue           1
#question               1
#registratedincorrectly 1
#registrationnoreceived 1
#registrationnotreceived 1
#registrationreprint    1
#reprint                2
#SSNdiscrepancy         1   ---<< 文本中的第二个主题标签
#Update                 12
#updateinfo             6
#void                   1

如果对 UDF 感兴趣

-- Purpose : Replace every control character (CHAR(0)..CHAR(31)) in @S with a space,
--           collapse each run of spaces to a single space, and trim both ends.
-- Param   : @S  text to clean.
-- Returns : the cleaned text; NULL when @S is NULL.
CREATE FUNCTION [dbo].[udf-Str-Strip-Control](@S varchar(max))
Returns varchar(max)
Begin
    Declare @Code int    = 0;
    Declare @Len  bigint = -1;

    -- Control characters become spaces (not empty strings) so that words separated only
    -- by a line break are not glued together.
    While @Code < 32
    Begin
        Set @S    = Replace(@S, Char(@Code), ' ');
        Set @Code = @Code + 1;
    End;

    -- Collapse runs of spaces by repeatedly replacing two spaces with one. The loop stops
    -- as soon as a pass no longer shortens the string (or @S is NULL). No marker strings
    -- are inserted, so text that already contains '<>' or '><' is left intact.
    While @Len <> DataLength(@S)
    Begin
        Set @Len = DataLength(@S);
        Set @S   = Replace(@S, '  ', ' ');
    End;

    Return LTrim(RTrim(@S));
End
--Select [dbo].[udf-Str-Strip-Control]('Michael        '+char(13)+char(10)+'LastName')  --Returns: Michael LastName

<小时>

-- Purpose : Return every piece of @String that sits between a begin delimiter
--           (@Delimiter1) and the next end delimiter (@Delimiter2).
-- Params  : @String      text to scan
--           @Delimiter1  begin delimiter
--           @Delimiter2  end delimiter
-- Returns : one row per extracted value
--           RetSeq  1-based sequence of the returned rows, ordered by position
--           RetPos  position in @String where the segment starts
--           RetVal  segment text up to (not including) the first @Delimiter2
-- Notes   : * Only the first 1,000,000 positions are scanned (cte1 cross-joined 6 times).
--           * Position 1 is always treated as a segment start, so text that precedes the
--             first @Delimiter1 is returned as well when it contains @Delimiter2 after
--             its first character. Callers that do not want this can prepend a sentinel.
CREATE FUNCTION [dbo].[udf-Str-Extract] (@String varchar(max),@Delimiter1 varchar(100),@Delimiter2 varchar(100))
Returns Table 
As
Return (  

-- cte1: ten rows, the seed for the tally.
-- cte2: positions 1..DataLength(@String). DataLength is used rather than Len because it
--       counts trailing spaces, which matters when a delimiter is a single space.
-- cte3: segment start positions: position 1 plus the position just after each @Delimiter1.
-- cte4: segment length = distance to the next @Delimiter1; for the last segment (no
--       further delimiter) the whole string length is used so nothing is cut short.
with   cte1(N)   As (Select 1 From (Values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) N(N)),
       cte2(N)   As (Select Top (IsNull(DataLength(@String),0)) Row_Number() over (Order By (Select NULL)) From (Select N=1 From cte1 N1,cte1 N2,cte1 N3,cte1 N4,cte1 N5,cte1 N6) A ),
       cte3(N)   As (Select 1 Union All Select t.N+DataLength(@Delimiter1) From cte2 t Where Substring(@String,t.N,DataLength(@Delimiter1)) = @Delimiter1),
       cte4(N,L) As (Select S.N,IsNull(NullIf(CharIndex(@Delimiter1,@String,S.N),0)-S.N,DataLength(@String)) From cte3 S)

Select RetSeq = Row_Number() over (Order By N)
      ,RetPos = N
      ,RetVal = left(RetVal,charindex(@Delimiter2,RetVal)-1) 
 From  (Select *,RetVal = Substring(@String, N, L) From cte4) A
 -- >1 drops segments with no end delimiter (0) and empty values (delimiter at position 1).
 Where charindex(@Delimiter2,RetVal)>1
)
/*
Max Length of String 1MM characters

Declare @String varchar(max) = 'Dear [[FirstName]] [[LastName]], ...'
Select * From [dbo].[udf-Str-Extract] (@String,'[[',']]')
*/

<块引用>

编辑 - 可能有助于可视化

如果您在没有任何聚合的情况下运行查询

-- Un-aggregated view: one row per extracted token next to the ticket text it came from.
-- Every '#' is rewritten to '|||#' and '|||' is used as the begin delimiter, so each
-- returned RetVal keeps its leading '#'. The leading ' ' and trailing ' .' pad the text
-- before it is scanned.
SELECT
    A.TicketDescription,
    B.RetSeq,
    B.RetPos,
    B.RetVal
FROM trending AS A
CROSS APPLY [dbo].[udf-Str-Extract](
                [dbo].[udf-Str-Strip-Control](' ' + REPLACE(A.TicketDescription, '#', '|||#')) + ' .',
                '|||',
                ' '
            ) AS B
ORDER BY A.TicketDescription;

你会得到

I am trying to pull all "#" comments from a text field in a SQL Server 2014 database. I am using the code I found on this MSDN thread to find multiple hashtags in one record and have modified it slightly to fit my needs, but I am seeing some unexpected results in my result set.

My T-SQL so far:

-- Tally of hashtags found in dbo.Trending.ticketDescription, built with a numbers table.
-- Drop the three scratch temp tables first so the script can be re-run in one session.
IF OBJECT_ID('tempdb..#hashtag') IS NOT NULL DROP TABLE #hashtag;

IF OBJECT_ID('tempdb..#numbers') IS NOT NULL DROP TABLE #numbers;

IF OBJECT_ID('tempdb..#hashtagcounts') IS NOT NULL DROP TABLE #hashtagcounts;

-- Numbers table holding 1..499 (the loop stops before 500). It supplies the candidate
-- character positions below, so positions past 499 are never examined.
CREATE TABLE #numbers ( N INT );
DECLARE @i INT; 
SET @i = 1;
WHILE @i < 500
BEGIN
    INSERT  #numbers
    VALUES  ( @i );
    SET @i = @i + 1;
END;

CREATE TABLE #hashtag ( tkt_desc VARCHAR(MAX) );

-- Keep descriptions that contain '#', excluding those where the trimmed text contains
-- '# ', those containing 'BATCH #', and those where '#' is followed by '.', ':', an
-- apostrophe or a digit 1-9.
INSERT INTO #hashtag ( tkt_desc )
SELECT  cst.ticketDescription
FROM    dbo.Trending AS [cst]
    WHERE   cst.ticketDescription LIKE '%#%'
            AND LTRIM(RTRIM(cst.ticketDescription)) NOT LIKE '%# %'
            AND cst.ticketDescription NOT LIKE '%BATCH #%'
            AND cst.ticketDescription NOT LIKE '%#[.:''1-9]%';

-- For every position N whose preceding character is '#', take the text from N up to the
-- next space (or LEN(tkt_desc) characters when no later space exists) and lower-case it.
-- NOTE: only a space character ends a tag here. A carriage return / line feed directly
-- after the tag is not treated as a terminator, so the tag keeps running until the first
-- space on the following line.
SELECT  LOWER ('#' + SUBSTRING(tkt_desc, N,
              CASE WHEN CHARINDEX(' ', tkt_desc, N) > 0
                   THEN CHARINDEX(' ', tkt_desc, N) - N
                   ELSE LEN(tkt_desc)
              END)) AS tkt_desc
INTO #hashtagcounts
FROM    #hashtag
    CROSS JOIN #numbers
WHERE   N <= LEN(tkt_desc) AND SUBSTRING(tkt_desc, N - 1, 1) = '#';

-- Frequency per extracted tag, most frequent first.
SELECT tkt_desc, COUNT(*) AS [Count]
FROM #hashtagcounts
GROUP BY tkt_desc
ORDER BY Count DESC, tkt_desc;

And my result set looks like:

tkt_desc              Count
#updateinfo           6
#update               4
#update update        3
#update updated       3
#reprint              2
#callback             1
#nochargereprint      1
#nocostreprint        1
#notes update         1
#paperlicense please  1

I am experiencing an issue where the #update tag is represented three different ways. Ideally, I would like to exclude anything that comes after a blank space ' ' in the hashtag, so that the #update tag shows a count of 10.

My initial thought was that since this is a text field, there may be a line break or carriage return in the field, so I tried to handle this by replacing SELECT cst.ticketDescription with REPLACE(REPLACE(cst.ticketDescription,CHAR(13),''),CHAR(10),''), but that simply combines the separate words into one hashtag. See the example below:

#updateinfo    6
#update        4
#updateupdate  3
#updateupdated 3

Any suggestions on how I might achieve my desired result? And I am including some sample data below in case anyone would like to experiment.

Query that I decided on using the two Functions / Cross Apply and cleaned up the LIKE and NOT LIKE statements:

-- Hashtag frequency for support tickets created between @paramStartDate and
-- @paramEndDate (both compared as dates, inclusive).
-- Tags are folded to lower case, and the grouping uses the same folded value so that
-- e.g. 'Update' and 'update' collapse into one output row even when the column's
-- collation is case-sensitive.
SELECT  '#' + LOWER(B.RetVal) AS [HashTag] ,
    COUNT(*) AS [Cnt]
FROM    dbo.Common_SupportTickets AS [cst]
    -- Control characters are turned into spaces first; the appended ' ' terminates a tag
    -- that sits at the very end of the description.
    CROSS APPLY [dbo].[udf-Str-Extract]([dbo].[udf-Str-Strip-Control](cst.ticketDescription) + ' ', '#', ' ') AS [B]
WHERE   cst.ticketDescription LIKE '%#%'
    -- Skip descriptions that end in a bare '#' once CR/LF pairs and outer spaces are removed.
    AND LTRIM(RTRIM(REPLACE(cst.ticketDescription,CHAR(13)+CHAR(10),''))) NOT LIKE '%#'
    AND cst.ticketDescription NOT LIKE '%BATCH #%'
    -- Skip '#' followed by '.', ':', an apostrophe, ')' or a digit 1-9.
    AND cst.ticketDescription NOT LIKE '%#[.:'')1-9]%'
    AND CAST(cst.createDate AS DATE) >= CAST( @paramStartDate AS DATE )
    AND CAST(cst.createDate AS DATE) <= CAST( @paramEndDate AS DATE )
GROUP BY LOWER(B.RetVal)

Sample data and records:

-- Sample-data setup: creates the dbo.Trending table in the Sandbox database.
USE [Sandbox]
GO

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
-- Single nullable varchar(max) column holding the free-text ticket description.
CREATE TABLE [dbo].[Trending](
[TicketDescription] [varchar](max) NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]

GO
-- Sample rows for dbo.Trending, one INSERT per ticket. Several descriptions contain a
-- line break directly after the hashtag, and two contain a second hashtag.
-- The literals are written as N'...' (Unicode) while the target column is varchar(max).
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#registrationnotreceived     customer has not received registration for boat...')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#registrationnoreceived      Customer called and still has not received duplicate registration...')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#callback (111) 111-1111

Agent''s POS is briefly turning on before "going to sleep" and entering sleep mode. Agent claims POS will not stay active for any length of time.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Corrected last name and driver''s license number.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Update customer''s last name.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Update last name, address')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#update - Profile updated. Corrected last name.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#updateinfo')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#updateinfo')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#updateinfo')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#updateinfo')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#question')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Update residency status')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#update #SSNdiscrepancy 
John Doe called in claiming this was their SSN, please advise. Please contact John Doe at this number (111-111-2222) when the issue is resolved. He wishes to create an account once the issue is resolved.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Notes
Update Customer''s Hunter certificate number')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#update
Updated residency status')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Changed residency from in-state to out-of-state, likely didn''t update.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Updated Customer''s last name')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#updateinfo')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#updateinfo')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#void - Agent called in asking to void a duplicate license sale.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update - updated customer''s last name')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#permissions
Changed agent role from AGENT CLERK to AGENT MANAGER in order to order supplies.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#registrationreprint customer didn''t receive registration I sent to Twra It for reprint. Told to call if he has not received in 10days ')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#printerissue')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#update - Profile updated. Religious Exempt.')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#reprint   this is 2nd call from customer that they have not received there boat registration...')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#registratedincorrectly  He send in check and info from Clerks office beginning of Dec, ')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#reprint #paperlicense Please reprint this license for the customer, he claims he has not yet received it. ')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#nocostreprint customer did not receive boat regst')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#Update
Updated customer''s address over the phone')
GO
INSERT [dbo].[Trending] ([TicketDescription]) VALUES (N'#nochargereprint  ')
GO

Just noticed a strange behavior when I ran this same script for a different date range - it is creating #hashtags that do not exist. For example, it returns Order: 10 times, but there is no #order: text within the field.

Sample data:

-- Repro data for the phantom "Order:" tag.
-- ticketDescription holds the raw ticket text; RetVal appears to record the token the
-- extraction query produced for that row (to be confirmed against the original report).
CREATE TABLE #temptable
(
    [ticketDescription] varchar(max),
    [RetVal]            varchar(max)
);

-- Explicit column list so the insert keeps working if the table definition changes.
INSERT INTO #temptable ([ticketDescription], [RetVal])
VALUES
    ( 'DURABLE HARD CARD RETURN-WAS GOING TO CALL TO PICK UP  NO# NUMBER ', 'DURABLE' ),
    ( 'Order: 30341143OrderItemId: 30517890License for: NATHAN TIMOTHY SUMNER  Printed on: 10 Apr 2017 06:43:57:857 Shipped to: 7650 KIOWA ST, APT #01 MILLINGTON TN 38053 - 3219', 'Order:' ),
    ( 'Order: 30341143OrderItemId: 30517890License for: NATHAN TIMOTHY SUMNER  Printed on: 10 Apr 2017 06:43:57:857 Shipped to: 7650 KIOWA ST, APT #01 MILLINGTON TN 38053 - 3219', '01' ),
    ( 'Order: 30346281OrderItemId: 30526511License for: STANLEY R ROWLAND  Printed on: 07 Apr 2017 06:22:23:417 Shipped to: 25 COUNTRY WOOD LN # 601 WALNUT MS 38683 - 5367', 'Order:' ),
    ( 'Order: 30347906OrderItemId: 30529325License for: DOUGLAS R EASTRIDGE  Printed on: 07 Apr 2017 09:29:51:643 Shipped to: 7980 HUFFS FERRY RD N # R LOUDON TN 37774 - 5910', 'Order:' ),
    ( 'Order: 30361947OrderItemId: 30554547License for: BLAKE R HADDON  Printed on: 07 Apr 2017 14:01:42:637 Shipped to: 146 14TH AVE NW # 146 WINCHESTER TN 37398 - 1079', 'Order:' ),
    ( 'Order: 30362075OrderItemId: 30554740License for: KYLE  JACKSON  Printed on: 07 Apr 2017 14:04:23:473 Shipped to: 3765 E ANDREW JOHNSON HWY, APT # A4 MORRISTOWN TN 37814 - 6200', 'Order:' ),
    ( 'Order: 30369152OrderItemId: 30565137License for: DANIEL JAMES SOLA  Printed on: 09 Apr 2017 07:08:28:683 Shipped to: 3212 MAYES LOOP RD # 1 PIGEON FORGE TN 37863 - 7721', 'Order:' ),
    ( 'Order: 30370097OrderItemId: 30566543License for: JAMES D JOHNSON  Printed on: 09 Apr 2017 11:51:37:170 Shipped to: 230 FRANKLIN RD # 907 FRANKLIN TN 37064 - 2256', 'Order:' ),
    ( 'Order: 30372876OrderItemId: 30571083License for: MARCOS  CLAUDIO P POLONIATO  Printed on: 06 Apr 2017 08:15:01:093 Shipped to: 295 WINDING RIVER DR, # J SANDY SPRINGS GA 30350 - 1926', 'Order:' ),
    ( 'Order: 30396415OrderItemId: 30604206License for: GARY T GOODMAN  Printed on: 07 Apr 2017 15:11:20:317 Shipped to: 1046 GREENBRIAR RD # RD.423 TALBOTT TN 37877 - 9055', 'Order:' ),
    ( 'Order: 30405689OrderItemId: 30617970License for: VANCE K JOHNSON  Printed on: 09 Apr 2017 09:25:48:670 Shipped to: 614 GARRISON HOLLOW RD, LOT # 11 ELIZABETHTON TN 37643 - 4897', 'Order:' );

SELECT [ticketDescription], [RetVal]
FROM   #temptable;

DROP TABLE #temptable;


EDIT - To Fix

-- Count hashtags in #temptable.
-- A '#>>> ' sentinel is prepended so that any text preceding the first real '#' is
-- attached to the sentinel and surfaces as the token '>>>', which is then discarded.
-- Tokens that are exactly two digits (e.g. the '01' in "APT #01") are discarded too.
SELECT
    '#' + B.RetVal AS HashTag,
    COUNT(*)       AS Cnt
FROM #temptable AS A
CROSS APPLY [dbo].[udf-Str-Extract](
                [dbo].[udf-Str-Strip-Control]('#>>> ' + A.TicketDescription) + ' ',
                '#',
                ' '
            ) AS B
WHERE B.RetVal <> '>>>'
  AND B.RetVal NOT LIKE '[0-9][0-9]'
GROUP BY B.RetVal
ORDER BY HashTag

解决方案

With the help of two functions and a CROSS APPLY, the following may help.

The first function strips all control characters and replaces each with a space, so that adjacent words are not concatenated: John{13}{10}Smith is returned as John Smith.

The second function is a modified parse function to accept two non-like delimiters (begin/end). In your case a # and space. If there are multiple hashtags, it will return multiple records.

Example

-- Count hashtags across dbo.Trending.
-- udf-Str-Extract also evaluates the text that precedes the first '#', so a description
-- that does not start with '#' would report its first word as a tag. Prepending the
-- '#>>> ' sentinel turns that leading text into the token '>>>', which is filtered out.
-- The trailing ' ' guarantees a tag at the very end of the text is still terminated.
SELECT
    '#' + B.RetVal AS HashTag,
    COUNT(*)       AS Cnt
FROM trending AS A
CROSS APPLY [dbo].[udf-Str-Extract](
                [dbo].[udf-Str-Strip-Control]('#>>> ' + A.TicketDescription) + ' ',
                '#',
                ' '
            ) AS B
WHERE B.RetVal <> '>>>'
GROUP BY B.RetVal
ORDER BY HashTag

Returns

HashTag                 Cnt
#callback               1
#nochargereprint        1
#nocostreprint          1
#Notes                  1
#paperlicense           1   ---<< 2nd hashtag in text
#permissions            1
#printerissue           1
#question               1
#registratedincorrectly 1
#registrationnoreceived 1
#registrationnotreceived 1
#registrationreprint    1
#reprint                2
#SSNdiscrepancy         1   ---<< 2nd hashtag in text
#Update                 12
#updateinfo             6
#void                   1

The UDFs, if interested

-- Purpose : Replace every control character (CHAR(0)..CHAR(31)) in @S with a space,
--           collapse each run of spaces to a single space, and trim both ends.
-- Param   : @S  text to clean.
-- Returns : the cleaned text; NULL when @S is NULL.
CREATE FUNCTION [dbo].[udf-Str-Strip-Control](@S varchar(max))
Returns varchar(max)
Begin
    Declare @Code int    = 0;
    Declare @Len  bigint = -1;

    -- Control characters become spaces (not empty strings) so that words separated only
    -- by a line break are not glued together.
    While @Code < 32
    Begin
        Set @S    = Replace(@S, Char(@Code), ' ');
        Set @Code = @Code + 1;
    End;

    -- Collapse runs of spaces by repeatedly replacing two spaces with one. The loop stops
    -- as soon as a pass no longer shortens the string (or @S is NULL). No marker strings
    -- are inserted, so text that already contains '<>' or '><' is left intact.
    While @Len <> DataLength(@S)
    Begin
        Set @Len = DataLength(@S);
        Set @S   = Replace(@S, '  ', ' ');
    End;

    Return LTrim(RTrim(@S));
End
--Select [dbo].[udf-Str-Strip-Control]('Michael        '+char(13)+char(10)+'LastName')  --Returns: Michael LastName


-- Purpose : Return every piece of @String that sits between a begin delimiter
--           (@Delimiter1) and the next end delimiter (@Delimiter2).
-- Params  : @String      text to scan
--           @Delimiter1  begin delimiter
--           @Delimiter2  end delimiter
-- Returns : one row per extracted value
--           RetSeq  1-based sequence of the returned rows, ordered by position
--           RetPos  position in @String where the segment starts
--           RetVal  segment text up to (not including) the first @Delimiter2
-- Notes   : * Only the first 1,000,000 positions are scanned (cte1 cross-joined 6 times).
--           * Position 1 is always treated as a segment start, so text that precedes the
--             first @Delimiter1 is returned as well when it contains @Delimiter2 after
--             its first character. Callers that do not want this can prepend a sentinel.
CREATE FUNCTION [dbo].[udf-Str-Extract] (@String varchar(max),@Delimiter1 varchar(100),@Delimiter2 varchar(100))
Returns Table 
As
Return (  

-- cte1: ten rows, the seed for the tally.
-- cte2: positions 1..DataLength(@String). DataLength is used rather than Len because it
--       counts trailing spaces, which matters when a delimiter is a single space.
-- cte3: segment start positions: position 1 plus the position just after each @Delimiter1.
-- cte4: segment length = distance to the next @Delimiter1; for the last segment (no
--       further delimiter) the whole string length is used so nothing is cut short.
with   cte1(N)   As (Select 1 From (Values(1),(1),(1),(1),(1),(1),(1),(1),(1),(1)) N(N)),
       cte2(N)   As (Select Top (IsNull(DataLength(@String),0)) Row_Number() over (Order By (Select NULL)) From (Select N=1 From cte1 N1,cte1 N2,cte1 N3,cte1 N4,cte1 N5,cte1 N6) A ),
       cte3(N)   As (Select 1 Union All Select t.N+DataLength(@Delimiter1) From cte2 t Where Substring(@String,t.N,DataLength(@Delimiter1)) = @Delimiter1),
       cte4(N,L) As (Select S.N,IsNull(NullIf(CharIndex(@Delimiter1,@String,S.N),0)-S.N,DataLength(@String)) From cte3 S)

Select RetSeq = Row_Number() over (Order By N)
      ,RetPos = N
      ,RetVal = left(RetVal,charindex(@Delimiter2,RetVal)-1) 
 From  (Select *,RetVal = Substring(@String, N, L) From cte4) A
 -- >1 drops segments with no end delimiter (0) and empty values (delimiter at position 1).
 Where charindex(@Delimiter2,RetVal)>1
)
/*
Max Length of String 1MM characters

Declare @String varchar(max) = 'Dear [[FirstName]] [[LastName]], ...'
Select * From [dbo].[udf-Str-Extract] (@String,'[[',']]')
*/

EDIT - May Help with the Visualization

If you run the query without any aggregation

-- Un-aggregated view: one row per extracted token next to the ticket text it came from.
-- Every '#' is rewritten to '|||#' and '|||' is used as the begin delimiter, so each
-- returned RetVal keeps its leading '#'. The leading ' ' and trailing ' .' pad the text
-- before it is scanned.
-- The column list spells out A.* for a Trending table with the single column
-- TicketDescription, followed by the three columns udf-Str-Extract returns.
SELECT
    A.TicketDescription,
    B.RetSeq,
    B.RetPos,
    B.RetVal
FROM trending AS A
CROSS APPLY [dbo].[udf-Str-Extract](
                [dbo].[udf-Str-Strip-Control](' ' + REPLACE(A.TicketDescription, '#', '|||#')) + ' .',
                '|||',
                ' '
            ) AS B
ORDER BY A.TicketDescription

You'll Get

这篇关于从 SQL Server 记录解析 #Hashtag 注释的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆