Replication Undelivered Commands: Monitoring and Reporting

on June 7, 2009

This post is to share a script I’ve been working on periodically over the last couple of months to monitor and report on replication latency at the distributor.

I use this in monitoring transactional replication with a stand-alone distributor.

The basic purpose for the script is:

  • Check the number of undelivered commands for all active publications
  • Log the data found to a table (which is collected on a centralized server so the history can be displayed graphically in Reporting Services 2008– that code not in this post)
  • When the number of undelivered commands is  higher than allowed, fail the job and send an email with a table highlighting the publications that are behind
  • Accept a parameter of dynamic length to allow specifying publications that should not be monitored as “production” (ie, do not alert in monitoring). Our repl servers are not SQL 2008 yet, so I use an XML wad rather than a TVP. I do love TVP’s for this type of thing though.
  • And be able to accomplish all these tasks from a non-system database.  (No custom monitoring code in system or application dbs!)

So this script shows not only how to check for undelivered commands, but also how to build a quick and easy HTML table to summarize data in emails. I always really like having as much data as possible about the state of the system in an email which is requesting investigation. (For monitoring tie in/SQL job failures are written to the event log which are then picked up.)

Note: there are some email addresses at the bottom of the sproc which I should have parameterized.  I am leaving this way because everyone should read the sproc thoroughly before installing it anywhere ;)

The table for Logging

I pull all results to a centralized server, so this table stays small

if object_id('dba.ReplStatusHistory') is null
create table dba.replStatusHistory (
 replStatusId bigint identity primary key
 , replStatusTime datetime default (getdate()) not null
 , distributionDb sysname not null
 , publishingServer sysname not null
 , publishingDb sysname not null
 , subscribingServer sysname not null
 , subscribingDb sysname not null
 , subscriptionStatusId tinyint not null
 , PendingCmdCount bigint not null
 , exception bit default 0
 , publicationName sysname not null
)

The Sproc

--Usage sample:
--exec [dba].[monitorReplStatus] @testMode=1, @distributionDb='distribution', @sendEmail=2

CREATE PROCEDURE [dba].[monitorReplStatus]
 @nocEmailThreshold int = 100000
 , @testMode bit =0
 , @distributionDb sysname
 , @testModeEmailAddress nvarchar(1000) = 'some@body.com'
 , @sendEmail tinyint = 2 -- 0= don't mail, 1= always mail, 2=email only on error
 , @exceptionXML xml = N'
<EXCEPTION>
 <Exclude serverName="SOMESERVER" />
 <Exclude serverName="SOMEOTHERSERVER" />
 </EXCEPTION>
'
AS

SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
SET NOCOUNT ON;

declare @html nvarchar(max)
 , @emailRecipients nvarchar(2000)
 , @subj nvarchar(2000)
 , @replStatusTime datetime
 , @loggingWarning nvarchar(max)
 , @sqlGetSubscribers nvarchar(max)
 , @raiseErr bit
 , @i tinyint --iterator for loop
 , @publishingServer sysname
 , @publishingDb sysname
 , @publicationName sysname
 , @subscriptionType tinyint
 , @subscribingServer sysname
 , @subscribingDb sysname
 , @getStatsSql nvarchar(max)

set @replStatusTime=getdate();

declare @replSubscribers table (
 replSubscriberId int identity primary key
 , publishingServer sysname not null
 , publishingDb sysname not null
 , publicationName sysname not null
 , subscriptionType tinyint default (1)
 , subscribingServer sysname not null
 , subscribingDb sysname not null
 , subscriptionStatusId tinyint not null
 , subscriptionStatus nvarchar(256) not null
 , exception bit default(0)
 , pendingCmdCount bigInt null
 , estimatedProcessSeconds bigInt null
)

declare @replStatus table (
 replSubscriberId int primary key
 , pendingCmdCount bigInt
 , estimatedProcessSeconds bigInt
)

declare @exceptions table (
 serverName sysname not null
)

set @subj='Replication Status from ' + @@SERVERNAME + '.' + @distributionDb;
set @loggingWarning='' -- can't be null or it won't work later
set @raiseErr=0 -- should not be null

--First, parse exclusions
insert @exceptions (serverName)
SELECT DISTINCT
 serverName = x.serverName
FROM
 (
 SELECT DISTINCT
 serverName = e.i.value('@serverName','sysname')
 FROM @exceptionXML.nodes('EXCEPTION/Exclude') e(i)
 where e.i.value('@serverName','sysname')&nbsp; is not null
 ) x

--Now, get our list of subscribers for this distribution db
select @sqlGetSubscribers='
use ' + @distributionDb + ';
select
 publishingServer = pub.srvname
 , publishingDb = coalesce(sb.publisher_db,''????'')
 , publicationName = pb.publication
 , subscriptionType= sb.subscription_type
 , subscribingServer = sub.srvname
 , subscribingDb= coalesce(sb.subscriber_db,''????'')
 , subscriptionStatusId=coalesce(sb.status,-1)
 , subscriptionStatus= case sb.status when 0 then ''Inactive'' when 1 then ''Subscribed'' when 2 then ''Active'' else ''???'' end
from MSSubscriptions sb (nolock)
join MSPublications pb (nolock) on
 sb.publication_id=pb.publication_id
join master.dbo.sysservers sub (nolock) on
 sb.subscriber_id= sub.srvid
join master.dbo.sysservers pub (nolock) on
 sb.publisher_id= pub.srvid
group by
 pub.srvname
 , sb.publisher_db
 , pb.publication
 , sb.subscription_type
 , sub.srvname
 , sb.subscriber_db
 , sb.status
'
if @testMode=1 print @sqlGetSubscribers

begin try
 insert @replSubscribers (publishingServer, publishingDb, publicationName, subscriptionType, subscribingServer, subscribingDb, subscriptionStatusId, subscriptionStatus)
 exec sp_executesql @sqlGetSubscribers

 if @@rowcount =0
 begin
 set @loggingWarning=@loggingWarning + 'No subscribers found.'
 set @raiseErr=1
 end
 else
 update @replSubscribers
 set exception =1
 from @replSubscribers&nbsp; rs
 join @exceptions x on
 rs.subscribingServer=x.serverName

end try
begin catch
 set @loggingWarning=@loggingWarning + 'Could not pull subscriber list.'
 set @raiseErr=1
end catch

set @i=1;

--Now get the undelivered commands for each subscriber

while @raiseErr != 1 and @i = @nocEmailThreshold and exception =0) >= 1
begin
 set @sendEmail=1
end

--Fix up the email and send it
if @sendEmail=1
begin

 SELECT @html=
 '

 BODY&nbsp; {background-color:floralwhite; font-family: sans-serif}
 TABLE {border-width: 1px;border-style: solid;border-color: black;border-collapse: collapse;}
 TH&nbsp;&nbsp;&nbsp; {border-width: 1px;padding: 2px;border-style: solid;border-color: black; background-color:#f0f0f0; text-align: center; padding: 8px;}
 TD&nbsp;&nbsp;&nbsp; {border-width: 1px;padding: 2px;border-style: solid;border-color: black; background-color:#f9f9f9; text-align: center; padding: 8px;}

 <H1>PROD Replication Undelivered Commands</H1>'
 + case when @loggingWarning is not null then '<H3><font color="red">' + @loggingWarning + '</font></H3>' else '' end
 + '
Replication status from ' + @@SERVERNAME + '.' + @distributionDb
 + ' is summarized below. Please ticket and escalate any instances of undelivered commands for production servers over <font color="red">'
 + cast(@nocEmailThreshold&nbsp; as nvarchar) + '</font> as as per the <a href="http://yourTSG/OrOpsGuide/'+ @@SERVERNAME + '">'+ @@SERVERNAME +' wiki</a>.

<H2>Production Servers</H2>
 '
 --Build the upper table to display production data

 SELECT
 @html=@html + '
 <TABLE>
 <TR>
 <TH>
 publishing server
 </TH><TH>
 publishing db
 </TH><TH>
 subscribing server
 </TH><TH>
 subscribing db
 </TH><TH>
 publication name
 </TH><TH>
 subscription status
 </TH><TH>
 undelivered commands
 </TH>
 </TR>
 ';

 SELECT
 @html=@html +
 '<TR>
 <TD>' + publishingServer
 + '</TD>
 <TD>'+
 publishingDb
 + '</TD>
 <TD>'+
 subscribingServer
 + '</TD>
 <TD>'+
 subscribingDb
 + '</TD>
 <TD>'+
 publicationName
 + '</TD>
 <TD>'+
 subscriptionStatus
 + '</TD>
 <TD>'
 + case when coalesce(PendingCmdCount,-1) >= @nocEmailThreshold then '<font color="red">' else '' end
 + cast (coalesce(PendingCmdCount,-1) as nvarchar(256))
 + case when (coalesce(PendingCmdCount,-1) >= @nocEmailThreshold ) then '</font>' else '' end
 + '</TD>
 </TR>
 '
 from @replSubscribers sub
 where exception = 0
 order by
 publishingServer
 , publishingDb
 , subscribingServer
 , subscribingDb

 SELECT @html=@html +
 '</TABLE>'

 --Build the lower table to display exceptions (which are just an FYI)
 SELECT
 @html=@html + '
<H2>Non-Production Servers (informational only, do not ticket)</H2>
'

 SELECT
 @html=@html + '
 <TABLE>
 <TR>
 <TH>
 publishing server
 </TH><TH>
 publishing db
 </TH><TH>
 subscribing server
 </TH><TH>
 subscribing db
 </TH><TH>
 publication name
 </TH><TH>
 subscription status
 </TH><TH>
 undelivered commands
 </TH>
 </TR>
 ';

 SELECT
 @html=@html +
 '<TR>
 <TD>' + publishingServer
 + '</TD>
 <TD>'+
 publishingDb
 + '</TD>
 <TD>'+
 subscribingServer
 + '</TD>
 <TD>'+
 subscribingDb
 + '</TD>
 <TD>'+
 publicationName
 + '</TD>
 <TD>'+
 subscriptionStatus
 + '</TD>
 <TD>'
 -- don't color these red, they have exceptions....
 + cast (coalesce(PendingCmdCount,-1) as nvarchar(256))
 + '</TD>
 </TR>
 '
 from @replSubscribers sub
 where exception = 1
 order by
 publishingServer
 , publishingDb
 , subscribingServer
 , subscribingDb

 SELECT @html=@html +
 '</TABLE>'

 --Build the foooter
 SELECT
 @html=@html + '
'

 if @testMode=1
 begin
 select * from @exceptions
 select * from @replSubscribers sub

 select
 @emailRecipients = @testModeEmailAddress

 end
 else
 select
 @emailRecipients= 'some@body.com' + case when count(*) > 0 then ';some@body.com' else '' end
 from @replSubscribers sub
 where PendingCmdCount >= @nocEmailThreshold
 and exception =0

 exec msdb..sp_send_dbmail
 @profile_name=@@SERVERNAME
 , @recipients = @emailRecipients
 , @subject= @subj
 , @body_format = 'HTML'
 , @body=@html
 ;

end

--Clean up history
delete from dba.ReplStatusHistory
where replStatusTime = @nocEmailThreshold and exception =0) >= 1
begin
 set @raiseErr=1
 set @loggingWarning=@loggingWarning+ 'Error threshold exceeded for one or more subscriptions, please investigate.'
end

--Finish up and raise an error if we need to
if @raiseErr=1
begin
 print 'Logging warning: ' + @loggingWarning
 RAISERROR(@loggingWarning,16,1)
end