mp_guesspk.sas
Go to the documentation of this file.
1 /**
2  @file
3  @brief Guess the primary key of a table
4  @details Tries to guess the primary key of a table based on the following
5  logic:
6 
7  * Columns with nulls are ignored
8  * Return only column combinations that provide unique results
9  * Start from one column, then move out to composite keys of 2 to 6 columns
10 
11  The library of the target should be assigned before using this macro.
12 
13  Usage:
14 
15  filename mc url
16  "https://raw.githubusercontent.com/sasjs/core/main/all.sas";
17  %inc mc;
18  %mp_guesspk(sashelp.class,outds=classpks)
19 
20  @param [in] baseds The dataset to analyse
21  @param [out] outds= (mp_guesspk) Output dataset to contain the possible PKs
22  @param [in] max_guesses= (3) The total number of possible primary keys to
23  generate. A table may have multiple (unlikely) PKs, so no need to list them
24  all.
25  @param [in] min_rows= (5) The minimum number of rows a table should have in
26  order to try and guess the PK.
27  @param [in] ignore_cols (0) Space seperated list of columns which you are
28  sure are not part of the primary key (helps to avoid false positives)
29  @param [in] mdebug= Set to 1 to enable DEBUG messages and preserve outputs
30 
31  <h4> SAS Macros </h4>
32  @li mf_getvarlist.sas
33  @li mf_getuniquename.sas
34  @li mf_wordsInstr1butnotstr2.sas
35  @li mf_nobs.sas
36 
37  <h4> Related Macros </h4>
38  @li mp_getpk.sas
39 
40  @version 9.3
41  @author Allan Bowe
42 
43 **/
44 
45 %macro mp_guesspk(baseds
46  ,outds=mp_guesspk
47  ,max_guesses=3
48  ,min_rows=5
49  ,ignore_cols=0
50  ,mdebug=0
51 )/*/STORE SOURCE*/;
52 %local dbg;
53 %if &mdebug=1 %then %do;
54  %put &sysmacroname entry vars:;
55  %put _local_;
56 %end;
57 %else %let dbg=*;
58 
59 /* declare local vars */
60 %local var vars vcnt i j k l tmpvar tmpds rows posspks ppkcnt;
61 %let vars=%upcase(%mf_getvarlist(&baseds));
62 %let vars=%mf_wordsInStr1ButNotStr2(str1=&vars,str2=%upcase(&ignore_cols));
63 %let vcnt=%sysfunc(countw(&vars));
64 
65 %if &vcnt=0 %then %do;
66  %put &sysmacroname: &baseds has no variables! Exiting.;
67  %return;
68 %end;
69 
70 /* get null count and row count */
71 %let tmpvar=%mf_getuniquename();
72 proc sql noprint;
73 create table _data_ as select
74  count(*) as &tmpvar
75 %do i=1 %to &vcnt;
76  %let var=%scan(&vars,&i);
77  ,sum(case when &var is missing then 1 else 0 end) as &var
78 %end;
79  from &baseds;
80 
81 /* transpose table and scan for not null cols */
82 proc transpose;
83 data _null_;
84  set &syslast end=last;
85  length vars $32767;
86  retain vars ;
87  if _name_="&tmpvar" then call symputx('rows',col1,'l');
88  else if col1=0 then vars=catx(' ',vars,_name_);
89  if last then call symputx('posspks',vars,'l');
90 run;
91 
92 %let ppkcnt=%sysfunc(countw(&posspks));
93 %if &ppkcnt=0 %then %do;
94  %put &sysmacroname: &baseds has no non-missing variables! Exiting.;
95  %return;
96 %end;
97 
98 proc sort data=&baseds(keep=&posspks) out=_data_ noduprec;
99  by _all_;
100 run;
101 %local pkds; %let pkds=&syslast;
102 
103 %if &rows > %mf_nobs(&pkds) %then %do;
104  %put &sysmacroname: &baseds has no combination of unique records! Exiting.;
105  %return;
106 %end;
107 
108 /* now check cardinality */
109 proc sql noprint;
110 create table _data_ as select
111 %do i=1 %to &ppkcnt;
112  %let var=%scan(&posspks,&i);
113  count(distinct &var) as &var
114  %if &i<&ppkcnt %then ,;
115 %end;
116  from &pkds;
117 
118 /* transpose and sort by cardinality */
119 proc transpose;
120 proc sort; by descending col1;
121 run;
122 
123 /* create initial PK list and re-order posspks list */
124 data &outds(keep=pkguesses);
125  length pkguesses $5000 vars $5000;
126  set &syslast end=last;
127  retain vars ;
128  vars=catx(' ',vars,_name_);
129  if col1=&rows then do;
130  pkguesses=_name_;
131  output;
132  end;
133  if last then call symputx('posspks',vars,'l');
134 run;
135 
136 %if %mf_nobs(&outds) ge &max_guesses %then %do;
137  %put &sysmacroname: %mf_nobs(&outds) possible primary key values found;
138  %return;
139 %end;
140 
141 %if &ppkcnt=1 %then %do;
142  %put &sysmacroname: No more PK guess possible;
143  %return;
144 %end;
145 
146 /* begin scanning for uniques on pairs of PKs */
147 %let tmpds=%mf_getuniquename();
148 %local lev1 lev2;
149 %do i=1 %to &ppkcnt;
150  %let lev1=%scan(&posspks,&i);
151  %do j=2 %to &ppkcnt;
152  %let lev2=%scan(&posspks,&j);
153  %if &lev1 ne &lev2 %then %do;
154  /* check for two level uniqueness */
155  proc sort data=&pkds(keep=&lev1 &lev2) out=&tmpds noduprec;
156  by _all_;
157  run;
158  %if %mf_nobs(&tmpds)=&rows %then %do;
159  proc sql;
160  insert into &outds values("&lev1 &lev2");
161  %if %mf_nobs(&outds) ge &max_guesses %then %do;
162  %put &sysmacroname: Max PKs reached at Level 2 for &baseds;
163  %goto exit;
164  %end;
165  %end;
166  %end;
167  %end;
168 %end;
169 
170 %if &ppkcnt=2 %then %do;
171  %put &sysmacroname: No more PK guess possible;
172  %goto exit;
173 %end;
174 
175 /* begin scanning for uniques on PK triplets */
176 %local lev3;
177 %do i=1 %to &ppkcnt;
178  %let lev1=%scan(&posspks,&i);
179  %do j=2 %to &ppkcnt;
180  %let lev2=%scan(&posspks,&j);
181  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
182  %let lev3=%scan(&posspks,&k);
183  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do;
184  /* check for three level uniqueness */
185  proc sort data=&pkds(keep=&lev1 &lev2 &lev3) out=&tmpds noduprec;
186  by _all_;
187  run;
188  %if %mf_nobs(&tmpds)=&rows %then %do;
189  proc sql;
190  insert into &outds values("&lev1 &lev2 &lev3");
191  %if %mf_nobs(&outds) ge &max_guesses %then %do;
192  %put &sysmacroname: Max PKs reached at Level 3 for &baseds;
193  %goto exit;
194  %end;
195  %end;
196  %end;
197  %end;
198  %end;
199 %end;
200 
201 %if &ppkcnt=3 %then %do;
202  %put &sysmacroname: No more PK guess possible;
203  %goto exit;
204 %end;
205 
206 /* scan for uniques on up to 4 PK fields */
207 %local lev4;
208 %do i=1 %to &ppkcnt;
209  %let lev1=%scan(&posspks,&i);
210  %do j=2 %to &ppkcnt;
211  %let lev2=%scan(&posspks,&j);
212  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
213  %let lev3=%scan(&posspks,&k);
214  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
215  %let lev4=%scan(&posspks,&l);
216  %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then %do;
217  /* check for four level uniqueness */
218  proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4)
219  out=&tmpds noduprec;
220  by _all_;
221  run;
222  %if %mf_nobs(&tmpds)=&rows %then %do;
223  proc sql;
224  insert into &outds values("&lev1 &lev2 &lev3 &lev4");
225  %if %mf_nobs(&outds) ge &max_guesses %then %do;
226  %put &sysmacroname: Max PKs reached at Level 4 for &baseds;
227  %goto exit;
228  %end;
229  %end;
230  %end;
231  %end;
232  %end;
233  %end;
234 %end;
235 
236 %if &ppkcnt=4 %then %do;
237  %put &sysmacroname: No more PK guess possible;
238  %goto exit;
239 %end;
240 
241 /* scan for uniques on up to 4 PK fields */
242 %local lev5 m;
243 %do i=1 %to &ppkcnt;
244  %let lev1=%scan(&posspks,&i);
245  %do j=2 %to &ppkcnt;
246  %let lev2=%scan(&posspks,&j);
247  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
248  %let lev3=%scan(&posspks,&k);
249  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
250  %let lev4=%scan(&posspks,&l);
251  %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
252  %do m=5 %to &ppkcnt;
253  %let lev5=%scan(&posspks,&m);
254  %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5 %then %do;
255  /* check for four level uniqueness */
256  proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5)
257  out=&tmpds noduprec;
258  by _all_;
259  run;
260  %if %mf_nobs(&tmpds)=&rows %then %do;
261  proc sql;
262  insert into &outds values("&lev1 &lev2 &lev3 &lev4 &lev5");
263  %if %mf_nobs(&outds) ge &max_guesses %then %do;
264  %put &sysmacroname: Max PKs reached at Level 5 for &baseds;
265  %goto exit;
266  %end;
267  %end;
268  %end;
269  %end;
270  %end;
271  %end;
272  %end;
273 %end;
274 
275 %if &ppkcnt=5 %then %do;
276  %put &sysmacroname: No more PK guess possible;
277  %goto exit;
278 %end;
279 
280 /* scan for uniques on up to 4 PK fields */
281 %local lev6 n;
282 %do i=1 %to &ppkcnt;
283  %let lev1=%scan(&posspks,&i);
284  %do j=2 %to &ppkcnt;
285  %let lev2=%scan(&posspks,&j);
286  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
287  %let lev3=%scan(&posspks,&k);
288  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
289  %let lev4=%scan(&posspks,&l);
290  %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
291  %do m=5 %to &ppkcnt;
292  %let lev5=%scan(&posspks,&m);
293  %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5
294  %then %do n=6 %to &ppkcnt;
295  %let lev6=%scan(&posspks,&n);
296  %if &lev1 ne &lev6 & &lev2 ne &lev6 & &lev3 ne &lev6
297  & &lev4 ne &lev6 & &lev5 ne &lev6 %then
298  %do;
299  /* check for four level uniqueness */
300  proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5 &lev6)
301  out=&tmpds noduprec;
302  by _all_;
303  run;
304  %if %mf_nobs(&tmpds)=&rows %then %do;
305  proc sql;
306  insert into &outds
307  values("&lev1 &lev2 &lev3 &lev4 &lev5 &lev6");
308  %if %mf_nobs(&outds) ge &max_guesses %then %do;
309  %put &sysmacroname: Max PKs reached at Level 6 for &baseds;
310  %goto exit;
311  %end;
312  %end;
313  %end;
314  %end;
315  %end;
316  %end;
317  %end;
318  %end;
319 %end;
320 
321 %if &ppkcnt=6 %then %do;
322  %put &sysmacroname: No more PK guess possible;
323  %goto exit;
324 %end;
325 
326 %exit:
327 %if &mdebug=0 %then %do;
328  proc sql;
329  drop table &tmpds;
330 %end;
331 
332 %mend mp_guesspk;