mp_hashdirectory.sas
Go to the documentation of this file.
1 /**
2  @file
3  @brief Returns a unique hash for each file in a directory
4  @details Hashes each file in each directory, and then hashes the hashes to
5  create a hash for each directory also.
6 
7  This makes use of the new `hashing_file()` and `hashing` functions, available
8  since 9.4m6. Interestingly, those functions can be used in pure macro, eg:
9 
10  %put %sysfunc(hashing_file(md5,/path/to/file.blob,0));
11 
12  Actual usage:
13 
14  %let fpath=/some/directory;
15 
16  %mp_hashdirectory(&fpath,outds=myhash,maxdepth=2)
17 
18  data _null_;
19  set work.myhash;
20  put (_all_)(=);
21  run;
22 
23  Whilst files are hashed in their entirety, the logic for creating a folder
24  hash is as follows:
25 
26  @li Sort the files by filename (case sensitive, uppercase then lower)
27  @li Take the first 100 hashes, concatenate and hash
28  @li Concatenate this hash with another 100 hashes and hash again
29  @li Continue until the end of the folder. This is the folder hash
30  @li If a folder contains other folders, start from the bottom of the tree -
31  the folder hashes cascade upwards so you know immediately if there is a
32  change in a sub/sub directory
33  @li If a subfolder has no content (empty) then it is ignored. No hash created.
34  @li If the file is empty, it is also ignored / no hash created.
35  @li If the target directory (&inloc) is empty, &outds will also be empty
36 
37  <h4> SAS Macros </h4>
38  @li mp_dirlist.sas
39 
40  <h4> Related Files </h4>
41  @li mp_hashdataset.sas
42  @li mp_hashdirectory.test.sas
43  @li mp_md5.sas
44 
45  @param [in] inloc Full filepath of the directory to be hashed (unquoted)
46  @param [in] iftrue= (1=1) A condition under which the macro should be executed
47  @param [in] maxdepth= (0) Set to a positive integer to indicate the level of
48  subdirectory scan recursion - eg 3, to go `./3/levels/deep`. For unlimited
49  recursion, set to MAX.
50  @param [in] method= (MD5) the hashing method to use. Available options:
51  @li MD5
52  @li SHA1
53  @li SHA256
54  @li SHA384
55  @li SHA512
56  @li CRC32
57  @param [out] outds= (work.mp_hashdirectory) The output dataset. Contains:
58  @li directory - the parent folder
59  @li file_hash - the hash output
60  @li hash_duration - how long the hash took (first hash always takes longer)
61  @li file_path - /full/path/to/each/file.ext
62  @li file_or_folder - contains either "file" or "folder"
63  @li level - the depth of the directory (top level is 0)
64 
65  @version 9.4m6
66  @author Allan Bowe
67 **/
68 
%macro mp_hashdirectory(inloc,
  outds=work.mp_hashdirectory,
  method=MD5,
  maxdepth=0,
  iftrue=%str(1=1)
)/*/STORE SOURCE*/;

/*
  Overview of the steps below:
    1. List the contents of &inloc into &outds (via mp_dirlist)
    2. Hash every non-empty file with hashing_file()
    3. Working from the deepest level up to level 0, hash the hashes of each
       directory's members and write the result onto that directory's own row

  curlevel - the directory depth currently being rolled up
  tempds   - name of the auto-generated (work._data_) per-level hash table
  maxlevel - deepest level found in the listing
  hashlen  - width of the hash columns, derived from &method
*/
%local curlevel tempds maxlevel hashlen;

%if not(%eval(%unquote(&iftrue))) %then %return;

/*
  The hex digest width depends on the algorithm.  A fixed $32 only fits MD5,
  and would silently truncate SHA1 / SHA256 / SHA384 / SHA512 values.  The
  width never drops below 32, so the default (MD5) output is unchanged.
*/
%if %upcase(&method)=SHA512 %then %let hashlen=128;
%else %if %upcase(&method)=SHA384 %then %let hashlen=96;
%else %if %upcase(&method)=SHA256 %then %let hashlen=64;
%else %if %upcase(&method)=SHA1 %then %let hashlen=40;
%else %let hashlen=32;

/* get the directory listing */
%mp_dirlist(path=&inloc, outds=&outds, maxdepth=&maxdepth, showparent=YES)

/* create the hashes */
data &outds;
  set &outds (rename=(filepath=file_path));
  length FILE_HASH $&hashlen HASH_DURATION 8;
  /* keep alone defines the output columns - work variables are excluded */
  keep directory file_hash hash_duration file_path file_or_folder level;

  ts=datetime();
  if file_or_folder='file' then do;
    /* if file is empty, hashing_file will break - so ignore / delete */
    length fname val $8;
    /* fname is blank, so filename() generates a fileref and returns it */
    rc=filename(fname,file_path);
    fid=fopen(fname);
    if fid > 0 then do;
      rc=fread(fid);
      /* fget returns 0 only if a value could be read from the file */
      is_empty=fget(fid,val);
      /* only close a handle that was actually opened */
      rc=fclose(fid);
    end;
    rc=filename(fname);
    /* is_empty stays missing when the file could not be opened, so files */
    /* that cannot be opened are removed along with the empty ones        */
    if is_empty ne 0 then delete;
    else file_hash=hashing_file("&method",cats(file_path),0);
  end;
  hash_duration=datetime()-ts;
run;

proc sort data=&outds ;
  by descending level directory file_path;
run;

/* the table is sorted by descending level, so the first row is the deepest */
%let maxlevel=0;
data _null_;
  set &outds;
  call symputx('maxlevel',level,'l');
  stop;
run;

/* now hash the hashes to populate folder hashes, starting from the bottom */
%do curlevel=&maxlevel %to 0 %by -1;
  data work._data_ (keep=directory file_hash);
    set &outds;
    where level=&curlevel;
    by descending level directory file_path;
    length str $32767 tmp_hash $&hashlen;
    retain str tmp_hash ;
    /* reset vars when starting a new directory */
    if first.directory then do;
      str='';
      tmp_hash='';
      i=0;
    end;
    /* accumulate member hashes, folding them in after every 100 entries */
    i+1;
    str=cats(str,file_hash);
    if mod(i,100)=0 or last.directory then do;
      /* chain the running hash with the next chunk of hashes */
      tmp_hash=hashing("&method",cats(tmp_hash,str));
      str='';
    end;
    /* output the hash at directory level */
    if last.directory then do;
      file_hash=tmp_hash;
      output;
    end;
    if last.level then stop;
  run;
  %let tempds=&syslast;
  /* join the hash back into the main table */
  /* (a directory's hash lands on the row whose file_path is that directory) */
  proc sql undo_policy=none;
  create table &outds as
    select a.directory
      ,coalesce(b.file_hash,a.file_hash) as file_hash
      ,a.hash_duration
      ,a.file_path
      ,a.file_or_folder
      ,a.level
    from &outds a
    left join &tempds b
    on a.file_path=b.directory
    order by level desc, directory, file_path;
  drop table &tempds;
  /* end the procedure so it is not left running when the macro exits */
  quit;
%end;

%mend mp_hashdirectory;