mp_stackdiffs.sas
Go to the documentation of this file.
1 /**
2  @file
3  @brief Prepares an audit table for stacking (re-applying) the changes.
4  @details When the underlying data from a Base Table is refreshed, it can be
5  helpful to have any previously-applied changes, re-applied.
6 
7  Such situation might arise if you are applying those changes using a tool
8  like [Data Controller for SASĀ®](https://datacontroller.io) - which records
9  all such changes in an audit table.
10  It may also apply if you are preparing a series of specific cell-level
11  transactions, that you would like to apply to multiple sets of (similarly
12  structured) Base Tables.
13 
14  In both cases, it is necessary that the transactions are stored using
15  the mp_storediffs.sas macro, or at least that the underlying table is
16  structured as per the definition in mp_coretable.sas (DIFFTABLE entry)
17 
18  <b>This</b> macro is used to convert the stored changes (tall format) into
19  staged changes (wide format), with base table values incorporated (in the
20  case of modified rows), ready for the subsequent load process.
21 
22  Essentially then, what this macro does, is turn a table like this:
23 
24 |KEY_HASH:$32.|MOVE_TYPE:$1.|TGTVAR_NM:$32.|IS_PK:best.|IS_DIFF:best.|TGTVAR_TYPE:$1.|OLDVAL_NUM:best32.|NEWVAL_NUM:best32.|OLDVAL_CHAR:$32765.|NEWVAL_CHAR:$32765.|
25 |---|---|---|---|---|---|---|---|---|---|
26 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`NAME `|`1 `|`-1 `|`C `|`. `|`. `|` `|`Newbie `|
27 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`AGE `|`0 `|`-1 `|`N `|`. `|`13 `|` `|` `|
28 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`HEIGHT `|`0 `|`-1 `|`N `|`. `|`65.3 `|` `|` `|
29 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`SEX `|`0 `|`-1 `|`C `|`. `|`. `|` `|`F `|
30 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`WEIGHT `|`0 `|`-1 `|`N `|`. `|`98 `|` `|` `|
31 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`NAME `|`1 `|`-1 `|`C `|`. `|`. `|`Alfred `|` `|
32 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`AGE `|`0 `|`-1 `|`N `|`14 `|`. `|` `|` `|
33 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`HEIGHT `|`0 `|`-1 `|`N `|`69 `|`. `|` `|` `|
34 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`SEX `|`0 `|`-1 `|`C `|`. `|`. `|`M `|` `|
35 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`WEIGHT `|`0 `|`-1 `|`N `|`112.5 `|`. `|` `|` `|
36 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`NAME `|`1 `|`0 `|`C `|`. `|`. `|`Alice `|`Alice `|
37 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`AGE `|`0 `|`1 `|`N `|`13 `|`99 `|` `|` `|
38 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`HEIGHT `|`0 `|`0 `|`N `|`56.5 `|`56.5 `|` `|` `|
39 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`SEX `|`0 `|`0 `|`C `|`. `|`. `|`F `|`F `|
40 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`WEIGHT `|`0 `|`0 `|`N `|`84 `|`84 `|` `|` `|
41 
42  Into three tables like this:
43 
44  <b> `work.outmod`: </b>
45  |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
46  |---|---|---|---|---|
47  |`Alice `|`F `|`99 `|`56.5 `|`84 `|
48 
49  <b> `work.outadd`: </b>
50  |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
51  |---|---|---|---|---|
52  |`Newbie `|`F `|`13 `|`65.3 `|`98 `|
53 
54  <b> `work.outdel`: </b>
55  |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
56  |---|---|---|---|---|
57  |`Alfred `|`M `|`14 `|`69 `|`112.5 `|
58 
59  As you might expect, there are a bunch of extra features and checks.
60 
61  The macro supports both SCD2 (TXTEMPORAL) and UPDATE loadtypes. If the
62  base table contains a PROCESSED_DTTM column (or similar), this can be
63  ignored by declaring it in the `processed_dttm_var` parameter.
64 
65  The macro is also flexible where columns have been added or removed from
66  the base table UNLESS there is a change to the primary key.
67 
68  Changes to the primary key fields are NOT supported, and are likely to cause
69  unexpected results.
70 
71  The following pre-flight checks are made:
72 
73  @li All primary key columns exist on the base table
74  @li There is no change in variable TYPE for any of the columns
75  @li There is no reduction in variable LENGTH below the max-length of the
76  supplied values
77 
78  Rules for stacking changes are as follows:
79 
80  <table>
81  <tr>
82  <th>Transaction Type</th><th>Key Behaviour</th><th>Column Behaviour</th>
83  </tr>
84  <tr>
85  <td>Deletes</td>
86  <td>
87  The row is added to `&outDEL.` UNLESS it no longer exists
88  in the base table, in which case it is added to `&errDS.` instead.
89  </td>
90  <td>
91  Deletes are unaffected by the addition or removal of non Primary-Key
92  columns.
93  </td>
94  </tr>
95  <tr>
96  <td>Inserts</td>
97  <td>
98  Previously newly added rows are added to the `outADD` table UNLESS they
99  are present in the Base table.<br>In this case they are added to the
100  `&errDS.` table instead.
101  </td>
102  <td>
103  Inserts are unaffected by the addition of columns in the Base Table
104  (they are padded with blanks). Deleted columns are only a problem if
105  they appear on the previous insert - in which case the record is added
106  to `&errDS.`.
107  </td>
108  </tr>
109  <tr>
110  <td>Updates</td>
111  <td>
112  Previously modified rows are merged with base table values such that
113  only the individual cells that were _previously_ changed are re-applied.
114  Where the row contains cells that were not marked as having changed in
115  the prior transaction, the 'blanks' are filled with base table values in
116  the `outMOD` table.<br>
117  If the row no longer exists on the base table, then the row is added to
118  the `errDS` table instead.
119  </td>
120  <td>
121  Updates are unaffected by the addition of columns in the Base Table -
122  the new cells are simply populated with Base Table values. Deleted
123  columns are only a problem if they relate to a modified cell
124  (`is_diff=1`) - in which case the record is added to `&errDS.`.
125  </td>
126  </tr>
127  </table>
128 
129  To illustrate the above with a diagram:
130 
131  @dot
132  digraph {
133  rankdir="TB"
134  start[label="Transaction Type?" shape=Mdiamond]
135  del[label="Does Base Row exist?" shape=rectangle]
136  add [label="Does Base Row exist?" shape=rectangle]
137  mod [label="Does Base Row exist?" shape=rectangle]
138  chkmod [label="Do all modified\n(is_diff=1) cells exist?" shape=rectangle]
139  chkadd [label="Do all inserted cells exist?" shape=rectangle]
140  outmod [label="outMOD\nTable" shape=Msquare style=filled]
141  outadd [label="outADD\nTable" shape=Msquare style=filled]
142  outdel [label="outDEL\nTable" shape=Msquare style=filled]
143  outerr [label="ErrDS Table" shape=Msquare fillcolor=Orange style=filled]
144  start -> del [label="Delete"]
145  start -> add [label="Insert"]
146  start -> mod [label="Update"]
147 
148  del -> outdel [label="Yes"]
149  del -> outerr [label="No" color="Red" fontcolor="Red"]
150  add -> chkadd [label="No"]
151  add -> outerr [label="Yes" color="Red" fontcolor="Red"]
152  mod -> outerr [label="No" color="Red" fontcolor="Red"]
153  mod -> chkmod [label="Yes"]
154  chkmod -> outerr [label="No" color="Red" fontcolor="Red"]
155  chkmod -> outmod [label="Yes"]
156  chkadd -> outerr [label="No" color="Red" fontcolor="Red"]
157  chkadd -> outadd [label="Yes"]
158 
159  }
160  @enddot
161 
162  For examples of usage, check out the mp_stackdiffs.test.sas program.
163 
164 
165  @param [in] baselibds Base Table against which the changes will be applied,
166  in libref.dataset format.
167  @param [in] auditlibds Dataset with previously applied transactions, to be
168  re-applied. Use libref.dataset format.
169  DDL as follows: %mp_coretable(DIFFTABLE)
170  @param [in] key Space seperated list of key variables
171  @param [in] mdebug= Set to 1 to enable DEBUG messages and preserve outputs
172  @param [in] processed_dttm_var= (0) If a variable is being used to mark
173  the processed datetime, put the name of the variable here. It will NOT
174  be included in the staged dataset (the load process is expected to
175  provide this)
176  @param [out] errds= (work.errds) Output table containing problematic records.
177  The columns of this table are:
178  @li PK_VARS - Space separated list of primary key variable names
179  @li PK_VALS - Slash separted list of PK variable values
180  @li ERR_MSG - Explanation of why this record is problematic
181  @param [out] outmod= (work.outmod) Output table containing modified records
182  @param [out] outadd= (work.outadd) Output table containing additional records
183  @param [out] outdel= (work.outdel) Output table containing deleted records
184 
185  <h4> SAS Macros </h4>
186  @li mf_existvarlist.sas
187  @li mf_getquotedstr.sas
188  @li mf_getuniquefileref.sas
189  @li mf_getuniquename.sas
190  @li mf_islibds.sas
191  @li mf_nobs.sas
192  @li mf_wordsinstr1butnotstr2.sas
193  @li mp_abort.sas
194  @li mp_ds2squeeze.sas
195 
196  <h4> Related Macros </h4>
197  @li mp_coretable.sas
198  @li mp_stackdiffs.test.sas
199  @li mp_storediffs.sas
200  @li mp_stripdiffs.sas
201 
202  @todo The current approach assumes that a variable called KEY_HASH is not on
203  the base table. This part will need to be refactored (eg using
204  mf_getuniquename.sas) when such a use case arises.
205 
206  @version 9.2
207  @author Allan Bowe
208 **/
209 /** @cond */
210 
211 %macro mp_stackdiffs(baselibds
212  ,auditlibds
213  ,key
214  ,mdebug=0
215  ,processed_dttm_var=0
216  ,errds=work.errds
217  ,outmod=work.outmod
218  ,outadd=work.outadd
219  ,outdel=work.outdel
220 )/*/STORE SOURCE*/;
221 %local dbg;
222 %if &mdebug=1 %then %do;
223  %put &sysmacroname entry vars:;
224  %put _local_;
225 %end;
226 %else %let dbg=*;
227 
228 /* input parameter validations */
229 %mp_abort(iftrue= (%mf_islibds(&baselibds) ne 1)
230  ,mac=&sysmacroname
231  ,msg=%str(Invalid baselibds: &baselibds)
232 )
233 %mp_abort(iftrue= (%mf_islibds(&auditlibds) ne 1)
234  ,mac=&sysmacroname
235  ,msg=%str(Invalid auditlibds: &auditlibds)
236 )
237 %mp_abort(iftrue= (%length(&key)=0)
238  ,mac=&sysmacroname
239  ,msg=%str(Missing key variables!)
240 )
241 %mp_abort(iftrue= (
242  %mf_existVarList(&auditlibds,LIBREF DSN MOVE_TYPE KEY_HASH TGTVAR_NM IS_PK
243  IS_DIFF TGTVAR_TYPE OLDVAL_NUM NEWVAL_NUM OLDVAL_CHAR NEWVAL_CHAR)=0
244  )
245  ,mac=&sysmacroname
246  ,msg=%str(Input &auditlibds is missing required columns!)
247 )
248 
249 
250 /* set up macro vars */
251 %local prefix dslist x var keyjoin commakey keepvars missvars fref;
252 %let prefix=%substr(%mf_getuniquename(),1,25);
253 %let dslist=ds1d ds2d ds3d ds1a ds2a ds3a ds1m ds2m ds3m pks dups base
254  delrec delerr addrec adderr modrec moderr;
255 %do x=1 %to %sysfunc(countw(&dslist));
256  %let var=%scan(&dslist,&x);
257  %local &var;
258  %let &var=%upcase(&prefix._&var);
259 %end;
260 
261 %let key=%upcase(&key);
262 %let commakey=%mf_getquotedstr(&key,quote=N);
263 
264 %let keyjoin=1=1;
265 %do x=1 %to %sysfunc(countw(&key));
266  %let var=%scan(&key,&x);
267  %let keyjoin=&keyjoin and a.&var=b.&var;
268 %end;
269 
270 data &errds;
271  length pk_vars $256 pk_vals $4098 err_msg $512;
272  call missing (of _all_);
273  stop;
274 run;
275 
276 /**
277  * Prepare raw DELETE table
278  * Records are in the OLDVAL_xxx columns
279  */
280 %let keepvars=MOVE_TYPE KEY_HASH TGTVAR_NM TGTVAR_TYPE IS_PK
281  OLDVAL_NUM OLDVAL_CHAR
282  NEWVAL_NUM NEWVAL_CHAR;
283 proc sort data=&auditlibds(where=(move_type='D') keep=&keepvars)
284  out=&ds1d(drop=move_type);
285 by KEY_HASH TGTVAR_NM;
286 run;
287 proc transpose data=&ds1d(where=(tgtvar_type='N'))
288  out=&ds2d(drop=_name_);
289  by KEY_HASH;
290  id TGTVAR_NM;
291  var OLDVAL_NUM;
292 run;
293 proc transpose data=&ds1d(where=(tgtvar_type='C'))
294  out=&ds3d(drop=_name_);
295  by KEY_HASH;
296  id TGTVAR_NM;
297  var OLDVAL_CHAR;
298 run;
299 %mp_ds2squeeze(&ds2d,outds=&ds2d)
300 %mp_ds2squeeze(&ds3d,outds=&ds3d)
301 data &outdel;
302  if 0 then set &baselibds;
303  set &ds2d;
304  set &ds3d;
305  drop key_hash;
306  if not missing(%scan(&key,1));
307 run;
308 proc sort;
309  by &key;
310 run;
311 
312 /**
313  * Prepare raw APPEND table
314  * Records are in the NEWVAL_xxx columns
315  */
316 proc sort data=&auditlibds(where=(move_type='A') keep=&keepvars)
317  out=&ds1a(drop=move_type);
318  by KEY_HASH TGTVAR_NM;
319 run;
320 proc transpose data=&ds1a(where=(tgtvar_type='N'))
321  out=&ds2a(drop=_name_);
322  by KEY_HASH;
323  id TGTVAR_NM;
324  var NEWVAL_NUM;
325 run;
326 proc transpose data=&ds1a(where=(tgtvar_type='C'))
327  out=&ds3a(drop=_name_);
328  by KEY_HASH;
329  id TGTVAR_NM;
330  var NEWVAL_CHAR;
331 run;
332 %mp_ds2squeeze(&ds2a,outds=&ds2a)
333 %mp_ds2squeeze(&ds3a,outds=&ds3a)
334 data &outadd;
335  if 0 then set &baselibds;
336  set &ds2a;
337  set &ds3a;
338  drop key_hash;
339  if not missing(%scan(&key,1));
340 run;
341 proc sort;
342  by &key;
343 run;
344 
345 /**
346  * Prepare raw MODIFY table
347  * Keep only primary key - will add modified values later
348  */
349 proc sort data=&auditlibds(
350  where=(move_type='M' and is_pk=1) keep=&keepvars
351  ) out=&ds1m(drop=move_type);
352  by KEY_HASH TGTVAR_NM;
353 run;
354 proc transpose data=&ds1m(where=(tgtvar_type='N'))
355  out=&ds2m(drop=_name_);
356  by KEY_HASH ;
357  id TGTVAR_NM;
358  var NEWVAL_NUM;
359 run;
360 proc transpose data=&ds1m(where=(tgtvar_type='C'))
361  out=&ds3m(drop=_name_);
362  by KEY_HASH;
363  id TGTVAR_NM;
364  var NEWVAL_CHAR;
365 run;
366 %mp_ds2squeeze(&ds2m,outds=&ds2m)
367 %mp_ds2squeeze(&ds3m,outds=&ds3m)
368 data &outmod;
369  if 0 then set &baselibds;
370  set &ds2m;
371  set &ds3m;
372  if not missing(%scan(&key,1));
373 run;
374 proc sort;
375  by &key;
376 run;
377 
378 /**
379  * Extract matching records from the base table
380  * Do this in one join for efficiency.
381  * At a later date, this should be optimised for large database tables by using
382  * passthrough and a temporary table.
383  */
384 data &pks;
385  if 0 then set &baselibds;
386  set &outadd &outmod &outdel;
387  keep &key;
388 run;
389 
390 proc sort noduprec dupout=&dups;
391 by &key;
392 run;
393 data _null_;
394  set &dups;
395  putlog (_all_)(=);
396 run;
397 %mp_abort(iftrue= (%mf_nobs(&dups) ne 0)
398  ,mac=&sysmacroname
399  ,msg=%str(duplicates (%mf_nobs(&dups)) found on &auditlibds!)
400 )
401 
402 proc sql;
403 create table &base as
404  select a.*
405  from &baselibds a, &pks b
406  where &keyjoin;
407 
408 /**
409  * delete check
410  * This is straightforward as it relates to records only
411  */
412 proc sql;
413 create table &delrec as
414  select a.*
415  from &outdel a
416  left join &base b
417  on &keyjoin
418  where b.%scan(&key,1) is null
419  order by &commakey;
420 
421 data &delerr;
422  if 0 then set &errds;
423  set &delrec;
424  PK_VARS="&key";
425  PK_VALS=catx('/',&commakey);
426  ERR_MSG="Rows cannot be deleted as they do not exist on the Base dataset";
427  keep PK_VARS PK_VALS ERR_MSG;
428 run;
429 proc append base=&errds data=&delerr;
430 run;
431 
432 data &outdel;
433  merge &outdel (in=a) &delrec (in=b);
434  by &key;
435  if not b;
436 run;
437 
438 /**
439  * add check
440  * Problems - where record already exists, or base table has columns missing
441  */
442 %let missvars=%mf_wordsinstr1butnotstr2(
443  Str1=%upcase(%mf_getvarlist(&outadd)),
444  Str2=%upcase(%mf_getvarlist(&baselibds))
445 );
446 %if %length(&missvars)>0 %then %do;
447  /* add them to the err table */
448  data &adderr;
449  if 0 then set &errds;
450  set &outadd;
451  PK_VARS="&key";
452  PK_VALS=catx('/',&commakey);
453  ERR_MSG="Rows cannot be added due to missing base vars: &missvars";
454  keep PK_VARS PK_VALS ERR_MSG;
455  run;
456  proc append base=&errds data=&adderr;
457  run;
458  proc sql;
459  delete * from &outadd;
460 %end;
461 %else %do;
462  proc sql;
463  /* find records that already exist on base table */
464  create table &addrec as
465  select a.*
466  from &outadd a
467  inner join &base b
468  on &keyjoin
469  order by &commakey;
470 
471  /* add them to the err table */
472  data &adderr;
473  if 0 then set &errds;
474  set &addrec;
475  PK_VARS="&key";
476  PK_VALS=catx('/',&commakey);
477  ERR_MSG="Rows cannot be added as they already exist on the Base dataset";
478  keep PK_VARS PK_VALS ERR_MSG;
479  run;
480  proc append base=&errds data=&adderr;
481  run;
482 
483  /* remove invalid rows from the outadd table */
484  data &outadd;
485  merge &outadd (in=a) &addrec (in=b);
486  by &key;
487  if not b;
488  run;
489 %end;
490 
491 /**
492  * mod check
493  * Problems - where record does not exist or baseds has modified cols missing
494  */
495 proc sql noprint;
496 select distinct tgtvar_nm into: missvars separated by ' '
497  from &auditlibds
498  where move_type='M' and is_diff=1;
499 %let missvars=%mf_wordsinstr1butnotstr2(
500  Str1=&missvars,
501  Str2=%upcase(%mf_getvarlist(&baselibds))
502 );
503 %if %length(&missvars)>0 %then %do;
504  /* add them to the err table */
505  data &moderr;
506  if 0 then set &errds;
507  set &outmod;
508  PK_VARS="&key";
509  PK_VALS=catx('/',&commakey);
510  ERR_MSG="Rows cannot be modified due to missing base vars: &missvars";
511  keep PK_VARS PK_VALS ERR_MSG;
512  run;
513  proc append base=&errds data=&moderr;
514  run;
515  proc sql;
516  delete * from &outmod;
517 %end;
518 %else %do;
519  /* now check for records that do not exist (therefore cannot be modified) */
520  proc sql;
521  create table &modrec as
522  select a.*
523  from &outmod a
524  left join &base b
525  on &keyjoin
526  where b.%scan(&key,1) is null
527  order by &commakey;
528  data &moderr;
529  if 0 then set &errds;
530  set &modrec;
531  PK_VARS="&key";
532  PK_VALS=catx('/',&commakey);
533  ERR_MSG="Rows cannot be modified as they do not exist on the Base dataset";
534  keep PK_VARS PK_VALS ERR_MSG;
535  run;
536  proc append base=&errds data=&moderr;
537  run;
538  /* delete the above records from the outmod table */
539  data &outmod;
540  merge &outmod (in=a) &modrec (in=b);
541  by &key;
542  if not b;
543  run;
544  /* now - we can prepare the final MOD table (which is currently PK only) */
545  proc sql undo_policy=none;
546  create table &outmod as
547  select a.key_hash
548  ,b.*
549  from &outmod a
550  inner join &base b
551  on &keyjoin
552  order by &commakey;
553  /* now - to update outmod with modified (is_diff=1) values */
554  %let fref=%mf_getuniquefileref();
555  data _null_;
556  file &fref;
557  set &auditlibds(where=(move_type='M')) end=lastobs;
558  by key_hash;
559  retain comma 'N';
560  if _n_=1 then put 'proc sql;';
561  if first.key_hash then do;
562  comma='N';
563  put "update &outmod set " @@;
564  end;
565  if is_diff=1 then do;
566  if comma='N' then do;
567  put ' '@@;
568  comma='Y';
569  end;
570  else put ' ,'@@;
571  if tgtvar_type='C' then do;
572  length qstr $32767;
573  qstr=quote(trim(NEWVAL_CHAR));
574  put tgtvar_nm '=' qstr;
575  end;
576  else put tgtvar_nm '=' newval_num;
577  if comma=' ' then comma=' ,';
578  end;
579  if last.key_hash then put ' where key_hash=trim("' key_hash '");';
580  if lastobs then put "alter table &outmod drop key_hash;";
581  run;
582  %inc &fref/source2;
583 %end;
584 
585 %if &mdebug=0 %then %do;
586  proc datasets lib=work;
587  delete &prefix:;
588  run;
589  %put &sysmacroname exit vars:;
590  %put _local_;
591 %end;
592 %mend mp_stackdiffs;
593 /** @endcond */