/* This program was developed by Westat, Inc. under NCI contract HHSN261201800002B*/ /* Version 1.0, copyright 2020 */ /* If you publish results based on these programs, please include the following citation: InfoUSA Measure Construction, Version 1.0 – December 2020; Division of Cancer Control and Population Sciences, National Cancer Institute. */ /* The program is distributed under the terms of the GNU General Public License: This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details . */ /********************************************************************************************************** PROGRAM: #01_ImportInfoUSA.sas OBJECTIVE: Measure Construction for InfoUSA data. This program follows the original STATA script in the appendix of the documentation for the Weight and Veterans' Environments Study. INPUT: RetailerNameLists.xlsx - Excel workbook with list of business names used in the original STATA script. The business names were originally coded directly into the STATA script; by storing the business names in Excel we can more easily add/delete business names as well improve readability of the SAS program. Order_1082862.xlsx - Excel file with Colorado data provided by InfoUSA. info_geo.xlsx - Excel file with geocode information for each record in the InfoUSA file. OUTPUT: GeneralMerchandise.sas7bdat ConvenienceStores.sas7bdat LiquorStores.sas7bdat Pharmacies.sas7bdat SupermarketsGrocery.sas7bdat Unclassified.sas7bdat Note: Script will need to be adjusted with project-specific file names and locations **********************************************************************************************************/ options mprint mlogic; libname proj "FILEPATH"; %macro ImportXLSXListIUSA(src=,byvar=,list=,invar=); **** This macro is used to import the list of business names from a single worksheet from the RetailerNameLists.xlsx workbook. ; **** The business names in the InfoUSA worksheet are formatted to consider multiple matching conditions ; **** from the original STATA script. After import, the list of business is compared to the InfoUSA source data set and **** a variable, &invar, is created to represent whether the InfoUSA business name meets ; **** any of the matching conditions of the business name from the Excel sheet. ; *** Import a specified Excel worksheet of business names. ; proc import datafile="FILEPATH\RetailerNameLists.xlsx" out=rlist dbms=xlsx replace; sheet="&list."; run; *** Convert STATA syntax to SAS syntax. ; data rlist; length retailer $50; set rlist; retailer=translate(retailer,'%','*'); retailer=translate(retailer,'_','?'); obs=_N_; run; *** Build lines of code using the business names. ; %let wherestring1=; %let wherestring2=; proc sql noprint ; select 'cn like "' || left(trim(retailer)) || '" ' into :wherestring1 separated by ' or ' from rlist where obs<=180; select 'or cn like "' || left(trim(retailer)) || '" ' into :wherestring2 separated by ' ' from rlist where obs>180; quit; *** Starting with the InfoUSA source data, create a subset data set that contains records that ; *** company names that match the business names from the Excel list. ; proc sort data=&src.; by &byvar.; run; data listcond; set &src.(rename=(companyname=cn)); where &wherestring1. &wherestring2. ; run; *** Merge the subset of matching business names back to the source InfoUSA data set.; *** Create a binary indicator to show whether or not a given company successfully matched with ; *** a business name from the Excel sheet.; data &src.; merge &src.(in=a) listcond(in=b keep=&byvar.); by &byvar.; if a; if b then &invar.=1; else &invar.=0; run; %mend ImportXLSXListIUSA; ****** Input files ; ****** Original file from InfoUSA ; %let input1 = FILEPATH\Order_1082862.xlsx ; ****** File with geocode information ; %let input2 = FILEPATH\info_geo.xlsx ; proc import datafile="&input1." out=InfoUSA dbms=xlsx replace; run; **** Reduce InfoUSA data files to only necessary variables for making measures.; data InfoUSA; set InfoUSA(rename=( VAR17=EmpSize5Location VAR18=EmpSize6Corporate VAR23=SalesVol5Location VAR24=SalesVol6Corporate SIC6_Descriptions__PRIMARYSIC_=PrimarySicDescription)); rename Company=companyname address_line_1=address city=cityname state=statealpha zipcode=digitzipcode Primary_SIC_Code=PrimarySic ; drop Landmark_: Mailing_: Unit_: Gender Call_Status_Code Subsidiary_Number--Census_Block NAICS_Code Primary_NAICS_Code NAICS8_Descriptions SIC_: SIC6: ; actualsalesvolume=input(Parent_Actual_Sales_Volume,9.); run; **** Check for duplicates; proc sql noprint; create table InfoUSADuplicates as select companyname,address,cityname,statealpha,digitzipcode, count(*) as nObs from InfoUSA group by companyname,address,cityname,statealpha,digitzipcode having count(*)>1; quit; title2"QC1: Duplicates in InfoUSA file based on "; title3"companyname,address,cityname,statealpha,digitzipcode"; proc print data=InfoUSADuplicates; var companyname address cityname statealpha digitzipcode nObs; run; title3; title2; proc sort data=InfoUSA out=qc nodupkey; by ABI; run; **** This portion of the file separates different store types for measure construction ; data InfoUSA; set InfoUSA; companyname=upcase(companyname); *** Count field value 1 required to make density maps; count=1; *** Liquor Stores ; liquor = 0; if primarysic in ("592102" "592103" "592104") then liquor = 1; run; *** General Merchandise; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 1, invar=in1); data InfoUSA(drop=in1); set InfoUSA; if in1 and not(companyname in ("WAL-MART NEIGHBORHOOD MARKET","WALMART NEIGHBORHOOD MARKET","COSCOB SPORTS MEDICINE")) then GenMerch=1; else GenMerch=0; run; *** Pharmacies. Initially set pharmacy type based on SIC code. ; data InfoUSA; set InfoUSA; sic4=substr(PrimarySic,1,4); if sic4="5912" then pharm=1; else pharm=0; run; *** Set pharmacy type to 0 if it is part of a general merchandise store.; *** We will set pharmacy type to 0 for grocery stores in a later step.; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 2, invar=in2); data InfoUSA(drop=in2); set InfoUSA; if in2 and genmerch=1 and pharm=1 then pharm=0; run; *** The next section creates duppharmgenmerch and duplatlonggenmerch, which are indicators of duplicates.; proc freq data=InfoUSA noprint; tables address * cityname * statealpha * zip4 / out=outd1; tables latitude * longitude / out=outd2; where genmerch=1; run; data outd1; set outd1; if count>1; run; data outd2; set outd2; if count>1; run; proc sort data=InfoUSA(drop=count); by address cityname statealpha zip4; run; data InfoUSA(drop=count); merge InfoUSA(in=a) outd1(in=b keep=address cityname statealpha zip4 count); by address cityname statealpha zip4; if a; if b then duppharmgenmerch=count-1; run; proc sort data=InfoUSA; by latitude longitude; run; data InfoUSA(drop=count); merge InfoUSA(in=a) outd2(in=b keep=latitude longitude count); by latitude longitude; if a; if b then duplatlonggenmerch=count-1; run; *** Convenience stores. Initially set convenience store type based on SIC code. ; data InfoUSA; set InfoUSA; if primarysic in ("544103","554101","554103") then conv=1; else conv=0; run; *** We will check to see if the convenience stores are actually pharmacies and ; *** unset the flag if found in a list of pharmacies. ; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 3, invar=in3); data InfoUSA(drop=in3); set InfoUSA; if in3 and conv=1 then conv=0; run; *** Set pharmacy flag for every pharmacy found under convenience stores.; data InfoUSA; set InfoUSA; if conv=0 and primarysic in ("541103" "554101" "554103") then pharm=1; run; *** The next section checks is any of the convenience stores are in a list of ; *** chain supermarkets or general merchandise. ; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 4, invar=in4); data listcond4alt; set InfoUSA; where companyname like 'SHOP%SAVE'; run; data InfoUSA; merge InfoUSA(in=a) listcond4alt(in=b keep=abi); by abi; if a; if b then in4=1; run; data InfoUSA(drop=in4); set InfoUSA; if in4 and conv=1 then conv=0; if companyname = "BIALOWIESKI MARKET" then conv=1; if companyname = "BILO GAS N GO" then conv=1; run; *** Supermarkets and grocery stores. Initially set supgroc indicator based on SIC code. ; *** But unset supgroc indicator if it is already a general merchandise store.; data InfoUSA; set InfoUSA; if primarysic in ("541101","541102","541104","541105","541106","541107","541108","541109") then supgroc=1; if genmerch=1 then supgroc=0; run; *** Unset supgroc indicator if it is in a list of pharmacies; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 5, invar=in5); data InfoUSA(drop=in5); set InfoUSA; if in5 and supgroc=1 then pharm=1; if pharm=1 then supgroc=0; run; *** Unset supgroc indicator if it is in a list of convenience stores.; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 6, invar=in6); data InfoUSA(drop=in6); set InfoUSA; if in6 and supgroc=1 then conv=1; if conv=1 then supgroc=0; *** Create a whole_sale indicator; whole_sale=0; if companyname="BOZZUTO'S" then whole_sale=1; run; *********************************************; ****CHAINS: AFTER CLASSIFICATION, pull chains; *********************************************; data InfoUSA ; set InfoUSA ; ***Supermarkets/Grocery Stores; ***Flag stores with > $2 million in annual sales. This will only be useful for Supermarkets/Grocery Stores; twomil = 0; if actualsalesvolume>1999 & actualsalesvolume ne . then twomil=1; ***Create Regional 11+ designator - shows stores that have 11 or more stores in the region = FMI definition of Supermarket; region=0; if statealpha="CT" or statealpha="ME" or statealpha="MA" or statealpha="NH" or statealpha="RI" or statealpha="VT" then region=1; if statealpha="NJ" or statealpha="NY" or statealpha="PA" then region=2; if statealpha="IN" or statealpha="IL" or statealpha="MI" or statealpha="OH" or statealpha="WI" then region=3; if statealpha="IA" or statealpha="KS" or statealpha="MN" or statealpha="MO" or statealpha="NE" or statealpha="ND" or statealpha="SD" then region=4; if statealpha="DE" or statealpha="DC" or statealpha="FL" or statealpha="GA" or statealpha="MD" or statealpha="NC" or statealpha="SC" or statealpha="VA" or statealpha="WV" then region=5; if statealpha="AL" or statealpha="KY" or statealpha="MS" or statealpha="TN" then region=6; if statealpha="AR" or statealpha="LA" or statealpha="OK" or statealpha="TX" then region=7; if statealpha="AZ" or statealpha="CO" or statealpha="ID" or statealpha="NM" or statealpha="MT" or statealpha="UT" or statealpha="NV" or statealpha="WY" then region=8; if statealpha="AK" or statealpha="CA" or statealpha="HI" or statealpha="OR" or statealpha="WA" then region=9; run; proc freq data=InfoUSA noprint; tables CompanyName * region / out=outcr; run; data outcr; set outcr; if count>=11; run; proc sort data=InfoUSA; by companyname region; run; data InfoUSA; merge InfoUSA(in=a) outcr(in=b keep=companyname region); by companyname region; if a; if b then elevenregion=1; else elevenregion=0; run; proc sort data=InfoUSA; by abi; run; *** Flag stores that are chain stores. ; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 7, invar=in7); data listcond7alt; set InfoUSA; where companyname like 'SHOP%SAVE'; run; data InfoUSA; merge InfoUSA(in=a) listcond7alt(in=b keep=abi); by abi; if a; if b then in7=1; run; data InfoUSA(drop=in7); set InfoUSA; sgnamechain=0; if in7 and supgroc=1 then sgnamechain=1; if indexw(companyname,"IGA") and supgroc=1 then sgnamechain=1; run; *** Convenience stores. ; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 8, invar=in8); data InfoUSA(drop=in8); set InfoUSA; convnamechain=0; if in8 and conv=1 then convnamechain=1; run; *** Pharmacies. ; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 9, invar=in9); data InfoUSA(drop=in9 untruepull); set InfoUSA; phnamechain=0; if in9 and pharm=1 then phnamechain=1; *** This variable culls those that should not have been pulled due to the name list. ; *** What that means is that we asked for SIC code 591205, and then for a name search ; *** to be performed for the above list of names on SIC4 5912. So if there is a record ; *** that has a SIC4 of 5912 but does not have a name on this list, it was not pulled as a pharmacy.; if phnamechain=0 & primarysic ne "591205" then untruepull=1; if untruepull=1 then pharm=0; run; *** Generate a flag genmerchmem for specific general merchandise. ; data genmerchmem; set InfoUSA; where genmerch=1 and ( companyname like "COSCO%" or companyname like "COSTCO%" or companyname like "SAM_S_CLUB" or companyname like "BJ'S_WHOLESALE%" or companyname like "BJS_WHOLESALE%" ); run; data InfoUSA; merge InfoUSA(in=a) genmerchmem(in=b keep=abi); by abi; if a; if b then genmerchmem=1; run; *** Supermarkets are "typed" at this point, so that if in classifying other store types something was turned into a supermarket, it still gets a type. ; proc format; value classif 1="<$2 Mill, No Chain Name, <11 Stores" 2="<$2 Mill, No Chain Name, 11+ Stores" 3="<$2 Mill, Chain Name, <11 Stores" 4="<$2 Mill, Chain Name, 11+ Stores" 5="$2+ Mill, No Chain Name, <11 Stores" 6="$2+ Mill, No Chain Name, 11+ Stores" 7="$2+ Mill, Chain Name, <11 Stores" 8="$2+ Mill, Chain Name, 11+ Stores"; run; data InfoUSA; set InfoUSA; type_reg=0; if twomil=0 & sgnamechain=0 & elevenregion=0 then type_reg=1; if twomil=0 & sgnamechain=0 & elevenregion=1 then type_reg=2; if twomil=0 & sgnamechain=1 & elevenregion=0 then type_reg=3; if twomil=0 & sgnamechain=1 & elevenregion=1 then type_reg=4; if twomil=1 & sgnamechain=0 & elevenregion=0 then type_reg=5; if twomil=1 & sgnamechain=0 & elevenregion=1 then type_reg=6; if twomil=1 & sgnamechain=1 & elevenregion=0 then type_reg=7; if twomil=1 & sgnamechain=1 & elevenregion=1 then type_reg=8; format type_reg classif.; run; *** It was found that many records had PO Boxes in the address fields.; *** Flag PO Boxes by Store Type ; data pobox; set InfoUSA; where address like "PO BOX%" or address like "%PO BOX%" or address like "POST OFFICE BOX%" or address like "RR BOX%" or address like "%RR BOX%" or address like "RURAL ROUTE BOX%" or address like "HC___BOX%" or address like "HC____BOX%" or address like "HC_____BOX%" or strip(address) like "HCBOX%" ; run; data InfoUSA; merge InfoUSA(in=a) pobox(in=b keep=abi); by abi; if a; pobox=0; if b then pobox=1; run; title2"Total Raw Count reports - by PO Box"; title3"Overall"; proc freq data=InfoUSA; tables pobox ; run; title3"Supermarkets/Grocery"; proc freq data=InfoUSA; tables pobox ; where supgroc=1; run; title3"Convenience Stores"; proc freq data=InfoUSA; tables pobox ; where conv=1; run; title3"Pharmacies"; proc freq data=InfoUSA; tables pobox ; where pharm=1; run; title3"Liquor Stores"; proc freq data=InfoUSA; tables pobox ; where liquor=1; run; title3"General Merchandise"; proc freq data=InfoUSA; tables pobox ; where genmerch=1; run; title3; title2; data T7R1_GeneralMerchandise T7R1_Liquor T7R1_Pharmacy T7R1_Convenience T7R1_SupermarketGrocery; set InfoUSA; if genmerch=1 then output T7R1_GeneralMerchandise; if liquor=1 then output T7R1_Liquor; if pharm=1 then output T7R1_Pharmacy; if conv=1 then output T7R1_Convenience; if supgroc=1 then output T7R1_SupermarketGrocery; run; title2"Housekeeping reports: based on address cityname statealpha zip4"; title3"Duplicate Supermarkets"; proc sql; select address,cityname,statealpha,zip4, count(*) as n from InfoUSA where supgroc=1 group by address,cityname,statealpha,zip4 having count(*)>1; quit; title3; title3"Duplicate General Merchandise/Grocery"; proc sql; select address,cityname,statealpha,zip4, count(*) as n from InfoUSA where genmerch=1 group by address,cityname,statealpha,zip4 having count(*)>1; quit; title3; title3"Duplicate Convenience Stores"; proc sql; select address,cityname,statealpha,zip4, count(*) as n from InfoUSA where conv=1 group by address,cityname,statealpha,zip4 having count(*)>1; quit; title3; title3"Duplicate Pharmacies"; proc sql; select address,cityname,statealpha,zip4, count(*) as n from InfoUSA where pharm=1 group by address,cityname,statealpha,zip4 having count(*)>1; quit; title3; title3"Duplicate Liquor"; proc sql; select address,cityname,statealpha,zip4, count(*) as n from InfoUSA where liquor=1 group by address,cityname,statealpha,zip4 having count(*)>1; quit; title3; title2; title2 "Cross-Classifications between General Merchandise and Pharmacy"; proc print data=InfoUSA; where genmerch=1 & pharm=1; run; title2; title2 "Cross-Classifications between General Merchandise and Liquor"; proc print data=InfoUSA; where genmerch=1 & liquor=1; run; title2; title2 "Cross-Classifications between General Merchandise and Convenience"; proc print data=InfoUSA; where genmerch=1 & conv=1; run; title2; title2 "Cross-Classifications between General Merchandise and Supermarkets/Groceries"; proc print data=InfoUSA; where genmerch=1 & supgroc=1; run; title2; title2"Stores Cross-Classified as General Merchandise and Pharmacy will (quietly) be made General Merchandise"; proc print data=InfoUSA; var companyname; where pharm=1 and genmerch=1; run; title2; %ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 10, invar=in10); data InfoUSA(drop=in10); set InfoUSA; if pharm=1 and genmerch=1 and in10 then pharm=0; run; title2"Stores Cross-Classified as General Merchandise and Liquor will (quietly) be made General Merchandise"; data cc2; set InfoUSA; where liquor=1 and genmerch=1 and (companyname like 'COSCO%' or companyname like 'COSTCO%'); run; proc print data=cc2; var companyname; where liquor=1 and genmerch=1; run; title2; data InfoUSA; merge InfoUSA(in=a) cc2(in=b keep=ABI); by ABI; if a; if b then liquor=0; run; title2; title2 "Cross-Classifications between Pharmacies and Liquor"; proc print data=InfoUSA n; var companyname; where pharm=1 & liquor=1; run; title2; title2 "Cross-Classifications between Pharmacies and Convenience"; proc print data=InfoUSA n; var companyname; where pharm=1 & conv=1; run; title2; title2 "Cross-Classifications between Pharmacies and Supermarkets/Groceries"; proc print data=InfoUSA n; var companyname; where pharm=1 & supgroc=1; run; title2; title2 "Cross-Classifications between Convenience and Liquor"; proc print data=InfoUSA n; var companyname; where conv=1 & liquor=1; run; title2; title2 "Cross-Classifications between Convenience and Supermarkets/Groceries"; proc print data=InfoUSA n; var companyname; where conv=1 & supgroc=1; run; title2; title2 "Cross-Classifications between Liquor and Supermarkets/Groceries"; proc print data=InfoUSA n; var companyname; where liquor=1 & supgroc=1; run; title2; *** What are we not classifying? There are many not classified at this point, including all the PAFacilities. ; proc format; value type 0="not classified" 1="general merchandise" 2="liquor" 3="pharmacy" 4="convenient store" 5="grocery/super market"; run; data InfoUSA; set InfoUSA; typestore=0; if genmerch=1 then typestore=1; if liquor=1 then typestore=2; if pharm=1 then typestore=3; if conv=1 then typestore=4; if supgroc=1 then typestore=5; label typestore="Macro store type"; format typestore type.; run; title2"all stores by type"; proc freq data=InfoUSA; tables typestore; run; *** In this section, individual store type files are made. This is important because deduplication is done within store types, ; *** so they have to be separated out, first. ; title2"Supermarket/Grocery Stores"; data SupermarketsGrocery; set InfoUSA; if supgroc=1; run; proc freq; tables primarysic primarysicdescription; run; title2"Pharmacies"; data Pharmacies; set InfoUSA; if pharm=1; run; proc freq; tables primarysic primarysicdescription; run; title2"Convenience Stores"; data ConvenienceStores; set InfoUSA; if conv=1; run; proc freq; tables primarysic primarysicdescription; run; title2"Liquor Stores"; data LiquorStores; set InfoUSA; if Liquor=1; run; proc freq; tables primarysic primarysicdescription; run; title2"General Merchandise Stores"; data GeneralMerchandise; set InfoUSA; if genmerch=1; run; proc freq; tables primarysic primarysicdescription; run; title2"PA Facilites/Unclassified"; data Unclassified; set InfoUSA; if typestore=0; run; proc freq; tables primarysic primarysicdescription; run; title2; *** In the original script, PO Box Geocodes were replaced with closest address Geocodes. ; *** For this script, we will add on closest address Geocode information for all records, not just PO Boxes. ; proc import datafile="&input2." out=igeo(keep=abi Loc_name Match_addr X Y dist_mi dist_m TRACT18 rename=(abi=abin)) dbms=xlsx replace; run; data igeo(drop=abin); set igeo; length abi $9; abi=put(abin,9.); length Loc_Name $12; if Loc_name = 'PointAddress' then Geocode_Level = '1-PointAddr'; else if Loc_name = 'StreetAddress' then Geocode_Level = '2-StreetAddr'; else if Loc_name = 'PostalExt' then Geocode_Level = '4-9digitZIP'; else if Loc_name = 'StreetName' then Geocode_Level = '5-StreetName'; else if Loc_name = 'Postal' then Geocode_Level = '6-5digitZIP'; else if Loc_name = 'AdminPlaces' then Geocode_Level = '7-AdminPlace'; else Geocode_Level = '9-' || Loc_name; length MatchCode $1; MatchCode = substr(Geocode_Level,1,1); run; proc sort data=igeo; by abi; run; data SupermarketsGrocery T7R2_SupermarketsGrocery; merge SupermarketsGrocery(in=a) igeo; by abi; if a; output SupermarketsGrocery; output T7R2_SupermarketsGrocery; run; data Pharmacies T7R2_Pharmacies; merge Pharmacies(in=a) igeo; by abi; if a; output Pharmacies; output T7R2_Pharmacies; run; data ConvenienceStores T7R2_ConvenienceStores; merge ConvenienceStores(in=a) igeo; by abi; if a; output ConvenienceStores; output T7R2_ConvenienceStores; run; data LiquorStores T7R2_LiquorStores; merge LiquorStores(in=a) igeo; by abi; if a; output LiquorStores; output T7R2_LiquorStores; run; data GeneralMerchandise T7R2_GeneralMerchandise; merge GeneralMerchandise(in=a) igeo; by abi; if a; output GeneralMerchandise; output T7R2_GeneralMerchandise; run; data Unclassified ; merge Unclassified(in=a) igeo; by abi; if a; run; *** Finally, create output data sets for each of the six store types. ; *** With each store type, remove duplicate rows and chain store rows. ; *** Create output data set for General Merchandise. ; data T7R3_GeneralMerchandise; set GeneralMerchandise; if not (matchcode in ("1","2","3","4")) then delete; run; data dedup1a; set GeneralMerchandise; where companyname like "WALMART%" or companyname like "SUPER_WALMART%" or companyname like "SUPER_WAL_MART%" or companyname like "WAL_MART*" ; run; data dedup1a; set dedup1a; length dedupbrand $ 30; dedupbrand="WALMART"; run; data dedup1b; set GeneralMerchandise; where companyname like "KMART%" or companyname = "SUPERKMART" or companyname like "SUPER_KMART" or companyname = "SUPERK-MART" or companyname like "SUPERK-MART" or companyname like "SUPER_K-MART" or companyname like "K-MART%" ; run; data dedup1b; set dedup1b; length dedupbrand $ 30; dedupbrand="KMART"; run; data dedup1c; set GeneralMerchandise; where companyname like "TARGET" or companyname like "SUPERTARGET" or companyname like "SUPER_TARGET" or companyname like "TARGETSUPER%" or companyname like "TARGET_SUPER%" ; run; data dedup1c; set dedup1c; length dedupbrand $ 30; dedupbrand="TARGET"; run; data dedup1d; set GeneralMerchandise; where companyname like "SAMSCLUB" or companyname like "SAMS_CLUB" or companyname like "SAM_S_CLUB" ; run; data dedup1d; set dedup1d; length dedupbrand $ 30; dedupbrand="SAMS"; run; data dedup1e; set GeneralMerchandise; where companyname like "BJ'SWHOLESALE%" or companyname like "BJSWHOLESALE%" or companyname like "BJ'S_WHOLESALE%" or companyname like "BJS_WHOLESALE%" ; run; data dedup1e; set dedup1e; length dedupbrand $ 30; dedupbrand="BJS"; run; data dedup1; set dedup1a dedup1b dedup1c dedup1d dedup1e; run; proc sort data=dedup1; by abi; run; data others; set GeneralMerchandise; length brand $ 30; brand=""; if companyname =: "MEIJER" then brand="MEIJER"; else if companyname =: "COSCO" or companyname =: "COSTCO" then brand="COSTCO"; run; data proj.GeneralMerchandise(drop=dedupbrand); merge others(in=a) dedup1(in=b keep=abi dedupbrand); by abi; if a; if b then brand=dedupbrand; format _all_; run; *** Create output data set for Convenience Stores. ; data ConvenienceStores; set ConvenienceStores; length word1-word3 $ 24; word1=scan(address,1); word2=scan(address,2); word3=scan(address,3); run; title3"qc: check convnamechain is not missing"; proc freq; tables convnamechain; run; title3; proc sort data=ConvenienceStores; by address cityname statealpha digitzipcode; run; proc means data=ConvenienceStores noprint; by address cityname statealpha digitzipcode; var convnamechain; output out=outcsa min=min_chain max=max_chain n=count; run; data outcsa(keep=address cityname statealpha digitzipcode count min_chain max_chain); set outcsa; if count>1; run; data T7R2_ConvenienceStores(drop=count); merge ConvenienceStores(in=a) outcsa(in=b); by address cityname statealpha digitzipcode; if a; if b then mult_sto=count-1; else mult_sto=0; run; data T7R3_ConvenienceStores; set T7R2_ConvenienceStores; if not (matchcode in ("1","2","3","4")) then delete; run; data T7R4_ConvenienceStores; set T7R3_ConvenienceStores; if mult_sto>0 and min_chain=0 and max_chain=1 and sgnamechain=0 then delete; run; proc sort data=T7R4_ConvenienceStores nodupkey out=T7R5_ConvenienceStores(drop=word1 word2 word3); by companyname cityname statealpha digitzipcode word1 word2 word3; run; data proj.ConvenienceStores; set T7R5_ConvenienceStores; deleteflag=0; if statealpha="AK" or statealpha="HI" then delete; format _all_; run; *** Create output data set for Liquor Stores. ; data T7R2_LiquorStores; set LiquorStores; length word1-word3 $ 24; word1=scan(address,1); word2=scan(address,2); word3=scan(address,3); word_r=ranuni(234); run; data T7R3_LiquorStores; set T7R2_LiquorStores; if not (matchcode in ("1","2","3","4")) then delete; run; *** Are there any stores duplicated at location because of multiple addresses that are exactly the same? ; *** De-duplicate of same addresses ; proc sort data=T7R3_LiquorStores nodupkey out=T7R4_LiquorStores; by address cityname statealpha digitzipcode; run; *** Drop duplicates with same company name and first 3 "words" of the address field, city, state, zip ; proc sort data=T7R4_LiquorStores nodupkey out=T7R5_LiquorStores(drop=word1 word2 word3 word_r); by companyname cityname statealpha digitzipcode word1 word2 word3; run; data proj.LiquorStores; set T7R5_LiquorStores; if statealpha="AK" or statealpha="HI" then delete; format _all_; run; *** Create output data set for Pharmacies. ; data Pharmacies; set Pharmacies; length word1-word3 $ 24; word1=scan(address,1); word2=scan(address,2); word3=scan(address,3); run; title3"qc: check phnamechain is not missing"; proc freq; tables phnamechain; run; title3; proc sort data=Pharmacies; by address cityname statealpha digitzipcode; run; proc means data=Pharmacies noprint; by address cityname statealpha digitzipcode; var phnamechain; output out=outpa min=min_chain max=max_chain n=count; run; data outpa(keep=address cityname statealpha digitzipcode count min_chain max_chain); set outpa; if count>1; run; data T7R2_Pharmacies(drop=count); merge Pharmacies(in=a) outpa(in=b); by address cityname statealpha digitzipcode; if a; if b then mult_sto=count-1; else mult_sto=0; run; data T7R3_Pharmacies; set T7R2_Pharmacies; if not (matchcode in ("1","2","3","4")) then delete; run; data T7R4_Pharmacies; set T7R3_Pharmacies; if mult_sto>0 and min_chain=0 and max_chain=1 and sgnamechain=0 then delete; run; proc sort data=T7R4_Pharmacies nodupkey out=T7R5_Pharmacies(drop=word1 word2 word3); by companyname cityname statealpha digitzipcode word1 word2 word3; run; data proj.Pharmacies; set T7R5_Pharmacies; if statealpha="AK" or statealpha="HI" then delete; format _all_; run; *** Create output data set for Supermarkets and Grocery Stores. ; data SupermarketsGrocery; set SupermarketsGrocery; length word1-word3 $ 24; word1=scan(address,1); word2=scan(address,2); word3=scan(address,3); run; title3"qc: check phnamechain is not missing"; proc freq; tables phnamechain; run; title3; proc sort data=SupermarketsGrocery; by address cityname statealpha digitzipcode; run; proc means data=SupermarketsGrocery noprint; by address cityname statealpha digitzipcode; var type_reg; output out=outsga min=min_chain max=max_chain n=count; run; data outsga(keep=address cityname statealpha digitzipcode count min_chain max_chain); set outsga; if count>1; run; data T7R2_SupermarketsGrocery(drop=count); merge SupermarketsGrocery(in=a) outsga(in=b); by address cityname statealpha digitzipcode; if a; if b then mult_sto=count-1; else mult_sto=0; run; data T7R3_SupermarketsGrocery; set T7R2_SupermarketsGrocery; if not (matchcode in ("1","2","3","4")) then delete; run; data T7R4_SupermarketsGrocery; set T7R3_SupermarketsGrocery; if min_chain