/* This program was developed by Westat, Inc. under NCI contract HHSN261201800002B*/ /* Version 1.0, copyright 2020 */ /* If you publish results based on these programs, please include the following citation: Dun and Bradstreet Measure Construction, Version 1.0 – December 2020; Division of Cancer Control and Population Sciences, National Cancer Institute. */ /* The program is distributed under the terms of the GNU General Public License: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details . */ /********************************************************************************************************** PROGRAM: #02_ImportDunBradstreet.sas OBJECTIVE: Measure Construction for Dun & Bradstreet data. This program follows the original STATA script in the appendix of the documentation for the Weight and Veterans' Environments Study. INPUT: RetailerNameLists.xlsx - Excel workbook with list of business names used in the original STATA script. The business names were originally coded directly into the STATA script; by storing the business names in Excel we can more easily add/delete business names as well improve readability of the SAS program. WESTAT_HISTORIC_OUTPUT_AUG2020.xlsx - Excel file with Colorado data provided by Dun & Bradstreet. dnb_geo.xlsx - Excel file with geocode information for each record in the Dun & Bradstreet file. OUTPUT: FF_Class.sas7bdat FFR_ForGrid.sas7bdat Note: Script will need to be adjusted with project-specific file names and locations **********************************************************************************************************/ options mprint mlogic; libname proj "FILEPATH"; %macro ImportXLSXListDB(src=,byvar=,list=,invar=); **** This macro is used to import the list of business names from a single worksheet from the RetailerNameLists.xlsx workbook. ; **** The business names in the Dun & Bradstreet worksheet are formatted to consider multiple matching conditions ; **** from the original STATA script. After import, the list of business is compared to the Dun & Bradstreet source data set and **** a variable, &invar, is created to represent whether the Dun & Bradstreet business name meets ; **** any of the matching conditions of the business name from the Excel sheet. ; *** Import a specified Excel worksheet of business names. ; proc import datafile="FILEPATH\RetailerNameLists.xlsx" out=rlist dbms=xlsx replace; sheet="&list."; run; *** Convert STATA syntax to SAS syntax. ; data rlist; length retailer $50; set rlist; retailer=translate(retailer,'%','*'); retailer=translate(retailer,'_','?'); obs=_N_; run; *** Build lines of code using the business names. ; %let wherestring1=; %let wherestring2=; proc sql noprint ; select 'bn like "' || left(trim(retailer)) || '" or tsn like "' || left(trim(retailer)) || '" ' into :wherestring1 separated by ' or ' from rlist where obs<=90; select 'or bn like "' || left(trim(retailer)) || '" or tsn like "' || left(trim(retailer)) || '" ' into :wherestring2 separated by ' ' from rlist where obs>90; quit; *** Starting with the Dun & Bradstreet source data, create a subset data set that contains records that ; *** company names that match the business names from the Excel list. ; proc sort data=&src.; by &byvar.; run; data listcond; set &src.(rename=(tradestylename=tsn businessname=bn)); where &wherestring1. &wherestring2. ; run; *** Merge the subset of matching business names back to the source Dun & Bradstreet data set.; *** Create a binary indicator to show whether or not a given company successfully matched with ; *** a business name from the Excel sheet.; data &src.; merge &src.(in=a) listcond(in=b keep=&byvar.); by &byvar.; if a; if b then &invar.=1; else &invar.=0; run; %mend ImportXLSXListDB; ****** Input files ; ****** Original file from Dun & Bradstreet ; %let input1 = FILEPATH\WESTAT_HISTORIC_OUTPUT_AUG2020.xlsx ; ****** File with geocode information ; %let input2 = FILEPATH\dnb_geo.xlsx ; proc import datafile="&input1." out=DunBradstreet(rename=(DUNS=dunsnumber)) dbms=xlsx replace; run; proc sort data=DunBradstreet; by dunsnumber; run; **** SIC code values can be longer than eight digits, so we will keep only first eight digits. ; data DunBradstreet(rename=(DTRADE=tradestylename DCOMP=businessname)); set DunBradstreet; primarysic = input(DPRIMSI || DSICEXT1, 8.); prim8digit=primarysic; if length(put(prim8digit,8.))=8 then only8=1; else only8=0; length prim8digits $8; prim8digits=DPRIMSI || DSICEXT1; run; data T7R1DB_; set DunBradstreet; run; *** Generate count of duplicates; proc freq data=DunBradstreet noprint; tables dunsnumber / out=outid; run; data outid; set outid; duplicateduns=count-1; run; data DunBradstreet; merge DunBradstreet(in=a) outid(keep=dunsnumber duplicateduns); by dunsnumber; run; *** Generate storetype variable values based on lists ; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 1, invar=in1); data DunBradstreet(drop=in1); set DunBradstreet; if in1 then storetype=1; run; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 2, invar=in2); data DunBradstreet(drop=in2); set DunBradstreet; if in2 then storetype=2; run; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 3, invar=in3); data DunBradstreet(drop=in3); set DunBradstreet; if in3 then storetype=3; run; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 4, invar=in4); data DunBradstreet(drop=in4); set DunBradstreet; if in4 then storetype=4; run; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 5, invar=in5); data DunBradstreet(drop=in5); set DunBradstreet; if in5 then storetype=5; run; *** Generate storetype variable values based on SIC code; data DunBradstreet; set DunBradstreet; if prim8digit=58120601 then storetype=6; else if prim8digit in (58120602,58120600) then storetype=7; else if prim8digit=58120304 then storetype=8; else if missing(storetype) then storetype=9; run; title2"Dun & Bradstreet file: distribution of storetype"; proc freq; tables storetype; run; title2; *** Drop convenience stores and full-service restaurants; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 6, invar=in6); data DunBradstreet(drop=in6); set DunBradstreet; if storetype>6 and in6 then conv=1; else conv=0; run; %ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 7, invar=in7); data DunBradstreet(drop=in7); set DunBradstreet; if storetype>6 and in7 then fullservice=1; else fullservice=0; run; data DunBradstreet; set DunBradstreet; by dunsnumber; if conv=1 then delete; if fullservice=1 then delete; run; data T7R2DB_; set DunBradstreet; run; title2"Dun & Bradstreet file: distribution of storetype after dropping Convenience and Full Service"; proc freq; tables storetype; run; title2; proc format; value FFRLabels 1="Top 5 Non-Sandwich, Non-Coffee Chains" 2="Subway" 3="Top Chains, Excluding Top 5, Coffeeshops, and Pizza" 4="Top Chain CoffeeShops" 5="Large Chain Pizza" 6="Chain Pizza, SIC: 58120601" 7="Non-Chain Pizza, SIC: 58120602 or 58120600" 8="Non-Chain CoffeeShops, SIC: 58120304" 9="FFR, Not Elsewhere Classified"; run; **** Add geocode information to the file. ; proc import datafile="&input2." out=dgeo(keep=duns Loc_name Match_addr X Y TRACT18) dbms=xlsx replace; run; data dgeo; set dgeo; length dunsnumber $9; dunsnumber=put(duns,Z9.); run; proc sort data=dgeo(drop=duns); by dunsnumber; run; data DunBradstreet; merge DunBradstreet(in=a) dgeo(in=b); by dunsnumber; if a; format storetype FFRLabels.; run; **** Delete rows who have insufficient geocode accuracy. ; data DunBradstreet; set DunBradstreet; length Loc_Name $12; if Loc_name = 'PointAddress' then Geocode_Level = '1-PointAddr'; else if Loc_name = 'StreetAddress' then Geocode_Level = '2-StreetAddr'; else if Loc_name = 'PostalExt' then Geocode_Level = '4-9digitZIP'; else if Loc_name = 'StreetName' then Geocode_Level = '5-StreetName'; else if Loc_name = 'Postal' then Geocode_Level = '6-5digitZIP'; else if Loc_name = 'AdminPlaces' then Geocode_Level = '7-AdminPlace'; else Geocode_Level = '9-' || Loc_name; length MatchCode $1; MatchCode = substr(Geocode_Level,1,1); if not (matchcode in ("1","2","3","4")) then delete; run; data T7R3DB_; set DunBradstreet; run; **** Delete Alaska & Hawaii rows and save Dun & Bradstreet file permanently. ; data proj.FF_Class; set DunBradstreet; if DSTATEAB in ("AK","HI") then delete; format _all_; run; **** Add count variable necessary for grids generation and save file permanently. ; data proj.FFR_ForGrid; set proj.FF_Class; count=1; run; title2"Count Summary Table Row 1: Counts"; proc print data=sashelp.vtable; var libname memname nobs; where libname="WORK" and substr(memname,1,7) eq "T7R1DB_"; run; title2; title2"Count Summary Table Row 2: Counts"; proc print data=sashelp.vtable; var libname memname nobs; where libname="WORK" and substr(memname,1,7) eq "T7R2DB_"; run; title2; title2"Count Summary Table Row 3: Counts"; proc print data=sashelp.vtable; var libname memname nobs; where libname="WORK" and substr(memname,1,7) eq "T7R3DB_"; run; title2; title2"Count Summary Table Row 6: Final counts"; proc print data=sashelp.vtable; var libname memname nobs; where libname="PROJ" and memname eq "FF_CLASS"; run; title2;