/* This program was developed by Westat, Inc. under NCI contract HHSN261201800002B*/
/* Version 1.0, copyright 2020 */
/* If you publish results based on these programs, please include the following citation:
InfoUSA Measure Construction, Version 1.0 – December 2020;
Division of Cancer Control and Population Sciences, National Cancer Institute.
*/
/* The program is distributed under the terms of the GNU General Public License:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details .
*/
/**********************************************************************************************************
PROGRAM:
#01_ImportInfoUSA.sas
OBJECTIVE:
Measure Construction for InfoUSA data.
This program follows the original STATA script in the appendix of the documentation
for the Weight and Veterans' Environments Study.
INPUT:
RetailerNameLists.xlsx
- Excel workbook with list of business names used in the original STATA script. The business names
were originally coded directly into the STATA script; by storing the business names in Excel
we can more easily add/delete business names as well improve readability of the SAS program.
Order_1082862.xlsx
- Excel file with Colorado data provided by InfoUSA.
info_geo.xlsx
- Excel file with geocode information for each record in the InfoUSA file.
OUTPUT:
GeneralMerchandise.sas7bdat
ConvenienceStores.sas7bdat
LiquorStores.sas7bdat
Pharmacies.sas7bdat
SupermarketsGrocery.sas7bdat
Unclassified.sas7bdat
Note: Script will need to be adjusted with project-specific file names and locations
**********************************************************************************************************/
options mprint mlogic;
libname proj "FILEPATH";
%macro ImportXLSXListIUSA(src=,byvar=,list=,invar=);
**** This macro is used to import the list of business names from a single worksheet from the RetailerNameLists.xlsx workbook. ;
**** The business names in the InfoUSA worksheet are formatted to consider multiple matching conditions ;
**** from the original STATA script. After import, the list of business is compared to the InfoUSA source data set and
**** a variable, &invar, is created to represent whether the InfoUSA business name meets ;
**** any of the matching conditions of the business name from the Excel sheet. ;
*** Import a specified Excel worksheet of business names. ;
proc import datafile="FILEPATH\RetailerNameLists.xlsx"
out=rlist
dbms=xlsx
replace;
sheet="&list.";
run;
*** Convert STATA syntax to SAS syntax. ;
data rlist;
length retailer $50;
set rlist;
retailer=translate(retailer,'%','*');
retailer=translate(retailer,'_','?');
obs=_N_;
run;
*** Build lines of code using the business names. ;
%let wherestring1=;
%let wherestring2=;
proc sql noprint ;
select 'cn like "' || left(trim(retailer)) || '" ' into :wherestring1 separated by ' or '
from rlist where obs<=180;
select 'or cn like "' || left(trim(retailer)) || '" ' into :wherestring2 separated by ' '
from rlist where obs>180;
quit;
*** Starting with the InfoUSA source data, create a subset data set that contains records that ;
*** company names that match the business names from the Excel list. ;
proc sort data=&src.;
by &byvar.;
run;
data listcond;
set &src.(rename=(companyname=cn));
where
&wherestring1.
&wherestring2.
;
run;
*** Merge the subset of matching business names back to the source InfoUSA data set.;
*** Create a binary indicator to show whether or not a given company successfully matched with ;
*** a business name from the Excel sheet.;
data &src.;
merge &src.(in=a) listcond(in=b keep=&byvar.);
by &byvar.;
if a;
if b then &invar.=1;
else &invar.=0;
run;
%mend ImportXLSXListIUSA;
****** Input files ;
****** Original file from InfoUSA ;
%let input1 = FILEPATH\Order_1082862.xlsx ;
****** File with geocode information ;
%let input2 = FILEPATH\info_geo.xlsx ;
proc import datafile="&input1."
out=InfoUSA
dbms=xlsx
replace;
run;
**** Reduce InfoUSA data files to only necessary variables for making measures.;
data InfoUSA;
set InfoUSA(rename=(
VAR17=EmpSize5Location
VAR18=EmpSize6Corporate
VAR23=SalesVol5Location
VAR24=SalesVol6Corporate
SIC6_Descriptions__PRIMARYSIC_=PrimarySicDescription));
rename
Company=companyname
address_line_1=address
city=cityname
state=statealpha
zipcode=digitzipcode
Primary_SIC_Code=PrimarySic
;
drop
Landmark_:
Mailing_:
Unit_:
Gender
Call_Status_Code
Subsidiary_Number--Census_Block
NAICS_Code
Primary_NAICS_Code
NAICS8_Descriptions
SIC_:
SIC6:
;
actualsalesvolume=input(Parent_Actual_Sales_Volume,9.);
run;
**** Check for duplicates;
proc sql noprint;
create table InfoUSADuplicates as
select companyname,address,cityname,statealpha,digitzipcode, count(*) as nObs
from InfoUSA
group by companyname,address,cityname,statealpha,digitzipcode
having count(*)>1;
quit;
title2"QC1: Duplicates in InfoUSA file based on ";
title3"companyname,address,cityname,statealpha,digitzipcode";
proc print data=InfoUSADuplicates;
var companyname address cityname statealpha digitzipcode nObs;
run;
title3;
title2;
proc sort data=InfoUSA out=qc nodupkey;
by ABI;
run;
**** This portion of the file separates different store types for measure construction ;
data InfoUSA;
set InfoUSA;
companyname=upcase(companyname);
*** Count field value 1 required to make density maps;
count=1;
*** Liquor Stores ;
liquor = 0;
if primarysic in ("592102" "592103" "592104") then liquor = 1;
run;
*** General Merchandise;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 1, invar=in1);
data InfoUSA(drop=in1);
set InfoUSA;
if in1 and
not(companyname in ("WAL-MART NEIGHBORHOOD MARKET","WALMART NEIGHBORHOOD MARKET","COSCOB SPORTS MEDICINE")) then GenMerch=1;
else GenMerch=0;
run;
*** Pharmacies. Initially set pharmacy type based on SIC code. ;
data InfoUSA;
set InfoUSA;
sic4=substr(PrimarySic,1,4);
if sic4="5912" then pharm=1;
else pharm=0;
run;
*** Set pharmacy type to 0 if it is part of a general merchandise store.;
*** We will set pharmacy type to 0 for grocery stores in a later step.;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 2, invar=in2);
data InfoUSA(drop=in2);
set InfoUSA;
if in2 and genmerch=1 and pharm=1 then pharm=0;
run;
*** The next section creates duppharmgenmerch and duplatlonggenmerch, which are indicators of duplicates.;
proc freq data=InfoUSA noprint;
tables address * cityname * statealpha * zip4 / out=outd1;
tables latitude * longitude / out=outd2;
where genmerch=1;
run;
data outd1;
set outd1;
if count>1;
run;
data outd2;
set outd2;
if count>1;
run;
proc sort data=InfoUSA(drop=count);
by address cityname statealpha zip4;
run;
data InfoUSA(drop=count);
merge InfoUSA(in=a) outd1(in=b keep=address cityname statealpha zip4 count);
by address cityname statealpha zip4;
if a;
if b then duppharmgenmerch=count-1;
run;
proc sort data=InfoUSA;
by latitude longitude;
run;
data InfoUSA(drop=count);
merge InfoUSA(in=a) outd2(in=b keep=latitude longitude count);
by latitude longitude;
if a;
if b then duplatlonggenmerch=count-1;
run;
*** Convenience stores. Initially set convenience store type based on SIC code. ;
data InfoUSA;
set InfoUSA;
if primarysic in ("544103","554101","554103") then conv=1;
else conv=0;
run;
*** We will check to see if the convenience stores are actually pharmacies and ;
*** unset the flag if found in a list of pharmacies. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 3, invar=in3);
data InfoUSA(drop=in3);
set InfoUSA;
if in3 and conv=1 then conv=0;
run;
*** Set pharmacy flag for every pharmacy found under convenience stores.;
data InfoUSA;
set InfoUSA;
if conv=0 and primarysic in ("541103" "554101" "554103") then pharm=1;
run;
*** The next section checks is any of the convenience stores are in a list of ;
*** chain supermarkets or general merchandise. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 4, invar=in4);
data listcond4alt;
set InfoUSA;
where companyname like 'SHOP%SAVE';
run;
data InfoUSA;
merge InfoUSA(in=a) listcond4alt(in=b keep=abi);
by abi;
if a;
if b then in4=1;
run;
data InfoUSA(drop=in4);
set InfoUSA;
if in4 and conv=1 then conv=0;
if companyname = "BIALOWIESKI MARKET" then conv=1;
if companyname = "BILO GAS N GO" then conv=1;
run;
*** Supermarkets and grocery stores. Initially set supgroc indicator based on SIC code. ;
*** But unset supgroc indicator if it is already a general merchandise store.;
data InfoUSA;
set InfoUSA;
if primarysic in ("541101","541102","541104","541105","541106","541107","541108","541109") then supgroc=1;
if genmerch=1 then supgroc=0;
run;
*** Unset supgroc indicator if it is in a list of pharmacies;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 5, invar=in5);
data InfoUSA(drop=in5);
set InfoUSA;
if in5 and supgroc=1 then pharm=1;
if pharm=1 then supgroc=0;
run;
*** Unset supgroc indicator if it is in a list of convenience stores.;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 6, invar=in6);
data InfoUSA(drop=in6);
set InfoUSA;
if in6 and supgroc=1 then conv=1;
if conv=1 then supgroc=0;
*** Create a whole_sale indicator;
whole_sale=0;
if companyname="BOZZUTO'S" then whole_sale=1;
run;
*********************************************;
****CHAINS: AFTER CLASSIFICATION, pull chains;
*********************************************;
data InfoUSA ;
set InfoUSA ;
***Supermarkets/Grocery Stores;
***Flag stores with > $2 million in annual sales. This will only be useful for Supermarkets/Grocery Stores;
twomil = 0;
if actualsalesvolume>1999 & actualsalesvolume ne . then twomil=1;
***Create Regional 11+ designator - shows stores that have 11 or more stores in the region = FMI definition of Supermarket;
region=0;
if statealpha="CT" or statealpha="ME" or statealpha="MA" or statealpha="NH" or statealpha="RI" or statealpha="VT" then region=1;
if statealpha="NJ" or statealpha="NY" or statealpha="PA" then region=2;
if statealpha="IN" or statealpha="IL" or statealpha="MI" or statealpha="OH" or statealpha="WI" then region=3;
if statealpha="IA" or statealpha="KS" or statealpha="MN" or statealpha="MO" or statealpha="NE" or statealpha="ND" or statealpha="SD" then region=4;
if statealpha="DE" or statealpha="DC" or statealpha="FL" or statealpha="GA" or statealpha="MD" or statealpha="NC" or statealpha="SC" or statealpha="VA" or statealpha="WV" then region=5;
if statealpha="AL" or statealpha="KY" or statealpha="MS" or statealpha="TN" then region=6;
if statealpha="AR" or statealpha="LA" or statealpha="OK" or statealpha="TX" then region=7;
if statealpha="AZ" or statealpha="CO" or statealpha="ID" or statealpha="NM" or statealpha="MT" or statealpha="UT" or statealpha="NV" or statealpha="WY" then region=8;
if statealpha="AK" or statealpha="CA" or statealpha="HI" or statealpha="OR" or statealpha="WA" then region=9;
run;
proc freq data=InfoUSA noprint;
tables CompanyName * region / out=outcr;
run;
data outcr;
set outcr;
if count>=11;
run;
proc sort data=InfoUSA;
by companyname region;
run;
data InfoUSA;
merge InfoUSA(in=a) outcr(in=b keep=companyname region);
by companyname region;
if a;
if b then elevenregion=1;
else elevenregion=0;
run;
proc sort data=InfoUSA;
by abi;
run;
*** Flag stores that are chain stores. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 7, invar=in7);
data listcond7alt;
set InfoUSA;
where companyname like 'SHOP%SAVE';
run;
data InfoUSA;
merge InfoUSA(in=a) listcond7alt(in=b keep=abi);
by abi;
if a;
if b then in7=1;
run;
data InfoUSA(drop=in7);
set InfoUSA;
sgnamechain=0;
if in7 and supgroc=1 then sgnamechain=1;
if indexw(companyname,"IGA") and supgroc=1 then sgnamechain=1;
run;
*** Convenience stores. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 8, invar=in8);
data InfoUSA(drop=in8);
set InfoUSA;
convnamechain=0;
if in8 and conv=1 then convnamechain=1;
run;
*** Pharmacies. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 9, invar=in9);
data InfoUSA(drop=in9 untruepull);
set InfoUSA;
phnamechain=0;
if in9 and pharm=1 then phnamechain=1;
*** This variable culls those that should not have been pulled due to the name list. ;
*** What that means is that we asked for SIC code 591205, and then for a name search ;
*** to be performed for the above list of names on SIC4 5912. So if there is a record ;
*** that has a SIC4 of 5912 but does not have a name on this list, it was not pulled as a pharmacy.;
if phnamechain=0 & primarysic ne "591205" then untruepull=1;
if untruepull=1 then pharm=0;
run;
*** Generate a flag genmerchmem for specific general merchandise. ;
data genmerchmem;
set InfoUSA;
where genmerch=1 and
(
companyname like "COSCO%" or
companyname like "COSTCO%" or
companyname like "SAM_S_CLUB" or
companyname like "BJ'S_WHOLESALE%" or
companyname like "BJS_WHOLESALE%"
);
run;
data InfoUSA;
merge InfoUSA(in=a) genmerchmem(in=b keep=abi);
by abi;
if a;
if b then genmerchmem=1;
run;
*** Supermarkets are "typed" at this point, so that if in classifying other store types something was turned into a supermarket, it still gets a type. ;
proc format;
value classif
1="<$2 Mill, No Chain Name, <11 Stores"
2="<$2 Mill, No Chain Name, 11+ Stores"
3="<$2 Mill, Chain Name, <11 Stores"
4="<$2 Mill, Chain Name, 11+ Stores"
5="$2+ Mill, No Chain Name, <11 Stores"
6="$2+ Mill, No Chain Name, 11+ Stores"
7="$2+ Mill, Chain Name, <11 Stores"
8="$2+ Mill, Chain Name, 11+ Stores";
run;
data InfoUSA;
set InfoUSA;
type_reg=0;
if twomil=0 & sgnamechain=0 & elevenregion=0 then type_reg=1;
if twomil=0 & sgnamechain=0 & elevenregion=1 then type_reg=2;
if twomil=0 & sgnamechain=1 & elevenregion=0 then type_reg=3;
if twomil=0 & sgnamechain=1 & elevenregion=1 then type_reg=4;
if twomil=1 & sgnamechain=0 & elevenregion=0 then type_reg=5;
if twomil=1 & sgnamechain=0 & elevenregion=1 then type_reg=6;
if twomil=1 & sgnamechain=1 & elevenregion=0 then type_reg=7;
if twomil=1 & sgnamechain=1 & elevenregion=1 then type_reg=8;
format type_reg classif.;
run;
*** It was found that many records had PO Boxes in the address fields.;
*** Flag PO Boxes by Store Type ;
data pobox;
set InfoUSA;
where
address like "PO BOX%" or
address like "%PO BOX%" or
address like "POST OFFICE BOX%" or
address like "RR BOX%" or
address like "%RR BOX%" or
address like "RURAL ROUTE BOX%" or
address like "HC___BOX%" or
address like "HC____BOX%" or
address like "HC_____BOX%" or
strip(address) like "HCBOX%"
;
run;
data InfoUSA;
merge InfoUSA(in=a) pobox(in=b keep=abi);
by abi;
if a;
pobox=0;
if b then pobox=1;
run;
title2"Total Raw Count reports - by PO Box";
title3"Overall";
proc freq data=InfoUSA;
tables pobox ;
run;
title3"Supermarkets/Grocery";
proc freq data=InfoUSA;
tables pobox ;
where supgroc=1;
run;
title3"Convenience Stores";
proc freq data=InfoUSA;
tables pobox ;
where conv=1;
run;
title3"Pharmacies";
proc freq data=InfoUSA;
tables pobox ;
where pharm=1;
run;
title3"Liquor Stores";
proc freq data=InfoUSA;
tables pobox ;
where liquor=1;
run;
title3"General Merchandise";
proc freq data=InfoUSA;
tables pobox ;
where genmerch=1;
run;
title3;
title2;
data T7R1_GeneralMerchandise T7R1_Liquor T7R1_Pharmacy T7R1_Convenience T7R1_SupermarketGrocery;
set InfoUSA;
if genmerch=1 then output T7R1_GeneralMerchandise;
if liquor=1 then output T7R1_Liquor;
if pharm=1 then output T7R1_Pharmacy;
if conv=1 then output T7R1_Convenience;
if supgroc=1 then output T7R1_SupermarketGrocery;
run;
title2"Housekeeping reports: based on address cityname statealpha zip4";
title3"Duplicate Supermarkets";
proc sql;
select address,cityname,statealpha,zip4, count(*) as n
from InfoUSA
where supgroc=1
group by address,cityname,statealpha,zip4
having count(*)>1;
quit;
title3;
title3"Duplicate General Merchandise/Grocery";
proc sql;
select address,cityname,statealpha,zip4, count(*) as n
from InfoUSA
where genmerch=1
group by address,cityname,statealpha,zip4
having count(*)>1;
quit;
title3;
title3"Duplicate Convenience Stores";
proc sql;
select address,cityname,statealpha,zip4, count(*) as n
from InfoUSA
where conv=1
group by address,cityname,statealpha,zip4
having count(*)>1;
quit;
title3;
title3"Duplicate Pharmacies";
proc sql;
select address,cityname,statealpha,zip4, count(*) as n
from InfoUSA
where pharm=1
group by address,cityname,statealpha,zip4
having count(*)>1;
quit;
title3;
title3"Duplicate Liquor";
proc sql;
select address,cityname,statealpha,zip4, count(*) as n
from InfoUSA
where liquor=1
group by address,cityname,statealpha,zip4
having count(*)>1;
quit;
title3;
title2;
title2 "Cross-Classifications between General Merchandise and Pharmacy";
proc print data=InfoUSA;
where genmerch=1 & pharm=1;
run;
title2;
title2 "Cross-Classifications between General Merchandise and Liquor";
proc print data=InfoUSA;
where genmerch=1 & liquor=1;
run;
title2;
title2 "Cross-Classifications between General Merchandise and Convenience";
proc print data=InfoUSA;
where genmerch=1 & conv=1;
run;
title2;
title2 "Cross-Classifications between General Merchandise and Supermarkets/Groceries";
proc print data=InfoUSA;
where genmerch=1 & supgroc=1;
run;
title2;
title2"Stores Cross-Classified as General Merchandise and Pharmacy will (quietly) be made General Merchandise";
proc print data=InfoUSA;
var companyname;
where pharm=1 and genmerch=1;
run;
title2;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 10, invar=in10);
data InfoUSA(drop=in10);
set InfoUSA;
if pharm=1 and genmerch=1 and in10 then pharm=0;
run;
title2"Stores Cross-Classified as General Merchandise and Liquor will (quietly) be made General Merchandise";
data cc2;
set InfoUSA;
where liquor=1 and genmerch=1 and
(companyname like 'COSCO%' or
companyname like 'COSTCO%');
run;
proc print data=cc2;
var companyname;
where liquor=1 and genmerch=1;
run;
title2;
data InfoUSA;
merge InfoUSA(in=a) cc2(in=b keep=ABI);
by ABI;
if a;
if b then liquor=0;
run;
title2;
title2 "Cross-Classifications between Pharmacies and Liquor";
proc print data=InfoUSA n;
var companyname;
where pharm=1 & liquor=1;
run;
title2;
title2 "Cross-Classifications between Pharmacies and Convenience";
proc print data=InfoUSA n;
var companyname;
where pharm=1 & conv=1;
run;
title2;
title2 "Cross-Classifications between Pharmacies and Supermarkets/Groceries";
proc print data=InfoUSA n;
var companyname;
where pharm=1 & supgroc=1;
run;
title2;
title2 "Cross-Classifications between Convenience and Liquor";
proc print data=InfoUSA n;
var companyname;
where conv=1 & liquor=1;
run;
title2;
title2 "Cross-Classifications between Convenience and Supermarkets/Groceries";
proc print data=InfoUSA n;
var companyname;
where conv=1 & supgroc=1;
run;
title2;
title2 "Cross-Classifications between Liquor and Supermarkets/Groceries";
proc print data=InfoUSA n;
var companyname;
where liquor=1 & supgroc=1;
run;
title2;
*** What are we not classifying? There are many not classified at this point, including all the PAFacilities. ;
proc format;
value type
0="not classified"
1="general merchandise"
2="liquor"
3="pharmacy"
4="convenient store"
5="grocery/super market";
run;
data InfoUSA;
set InfoUSA;
typestore=0;
if genmerch=1 then typestore=1;
if liquor=1 then typestore=2;
if pharm=1 then typestore=3;
if conv=1 then typestore=4;
if supgroc=1 then typestore=5;
label typestore="Macro store type";
format typestore type.;
run;
title2"all stores by type";
proc freq data=InfoUSA;
tables typestore;
run;
*** In this section, individual store type files are made. This is important because deduplication is done within store types, ;
*** so they have to be separated out, first. ;
title2"Supermarket/Grocery Stores";
data SupermarketsGrocery;
set InfoUSA;
if supgroc=1;
run;
proc freq;
tables primarysic primarysicdescription;
run;
title2"Pharmacies";
data Pharmacies;
set InfoUSA;
if pharm=1;
run;
proc freq;
tables primarysic primarysicdescription;
run;
title2"Convenience Stores";
data ConvenienceStores;
set InfoUSA;
if conv=1;
run;
proc freq;
tables primarysic primarysicdescription;
run;
title2"Liquor Stores";
data LiquorStores;
set InfoUSA;
if Liquor=1;
run;
proc freq;
tables primarysic primarysicdescription;
run;
title2"General Merchandise Stores";
data GeneralMerchandise;
set InfoUSA;
if genmerch=1;
run;
proc freq;
tables primarysic primarysicdescription;
run;
title2"PA Facilites/Unclassified";
data Unclassified;
set InfoUSA;
if typestore=0;
run;
proc freq;
tables primarysic primarysicdescription;
run;
title2;
*** In the original script, PO Box Geocodes were replaced with closest address Geocodes. ;
*** For this script, we will add on closest address Geocode information for all records, not just PO Boxes. ;
proc import datafile="&input2."
out=igeo(keep=abi Loc_name Match_addr X Y dist_mi dist_m TRACT18
rename=(abi=abin))
dbms=xlsx
replace;
run;
data igeo(drop=abin);
set igeo;
length abi $9;
abi=put(abin,9.);
length Loc_Name $12;
if Loc_name = 'PointAddress' then Geocode_Level = '1-PointAddr';
else if Loc_name = 'StreetAddress' then Geocode_Level = '2-StreetAddr';
else if Loc_name = 'PostalExt' then Geocode_Level = '4-9digitZIP';
else if Loc_name = 'StreetName' then Geocode_Level = '5-StreetName';
else if Loc_name = 'Postal' then Geocode_Level = '6-5digitZIP';
else if Loc_name = 'AdminPlaces' then Geocode_Level = '7-AdminPlace';
else Geocode_Level = '9-' || Loc_name;
length MatchCode $1;
MatchCode = substr(Geocode_Level,1,1);
run;
proc sort data=igeo;
by abi;
run;
data SupermarketsGrocery T7R2_SupermarketsGrocery;
merge SupermarketsGrocery(in=a) igeo;
by abi;
if a;
output SupermarketsGrocery; output T7R2_SupermarketsGrocery;
run;
data Pharmacies T7R2_Pharmacies;
merge Pharmacies(in=a) igeo;
by abi;
if a;
output Pharmacies; output T7R2_Pharmacies;
run;
data ConvenienceStores T7R2_ConvenienceStores;
merge ConvenienceStores(in=a) igeo;
by abi;
if a;
output ConvenienceStores; output T7R2_ConvenienceStores;
run;
data LiquorStores T7R2_LiquorStores;
merge LiquorStores(in=a) igeo;
by abi;
if a;
output LiquorStores; output T7R2_LiquorStores;
run;
data GeneralMerchandise T7R2_GeneralMerchandise;
merge GeneralMerchandise(in=a) igeo;
by abi;
if a;
output GeneralMerchandise; output T7R2_GeneralMerchandise;
run;
data Unclassified ;
merge Unclassified(in=a) igeo;
by abi;
if a;
run;
*** Finally, create output data sets for each of the six store types. ;
*** With each store type, remove duplicate rows and chain store rows. ;
*** Create output data set for General Merchandise. ;
data T7R3_GeneralMerchandise;
set GeneralMerchandise;
if not (matchcode in ("1","2","3","4")) then delete;
run;
data dedup1a;
set GeneralMerchandise;
where
companyname like "WALMART%" or
companyname like "SUPER_WALMART%" or
companyname like "SUPER_WAL_MART%" or
companyname like "WAL_MART*" ;
run;
data dedup1a;
set dedup1a;
length dedupbrand $ 30;
dedupbrand="WALMART";
run;
data dedup1b;
set GeneralMerchandise;
where
companyname like "KMART%" or
companyname = "SUPERKMART" or
companyname like "SUPER_KMART" or
companyname = "SUPERK-MART" or
companyname like "SUPERK-MART" or
companyname like "SUPER_K-MART" or
companyname like "K-MART%" ;
run;
data dedup1b;
set dedup1b;
length dedupbrand $ 30;
dedupbrand="KMART";
run;
data dedup1c;
set GeneralMerchandise;
where
companyname like "TARGET" or
companyname like "SUPERTARGET" or
companyname like "SUPER_TARGET" or
companyname like "TARGETSUPER%" or
companyname like "TARGET_SUPER%" ;
run;
data dedup1c;
set dedup1c;
length dedupbrand $ 30;
dedupbrand="TARGET";
run;
data dedup1d;
set GeneralMerchandise;
where
companyname like "SAMSCLUB" or
companyname like "SAMS_CLUB" or
companyname like "SAM_S_CLUB" ;
run;
data dedup1d;
set dedup1d;
length dedupbrand $ 30;
dedupbrand="SAMS";
run;
data dedup1e;
set GeneralMerchandise;
where
companyname like "BJ'SWHOLESALE%" or
companyname like "BJSWHOLESALE%" or
companyname like "BJ'S_WHOLESALE%" or
companyname like "BJS_WHOLESALE%"
;
run;
data dedup1e;
set dedup1e;
length dedupbrand $ 30;
dedupbrand="BJS";
run;
data dedup1; set dedup1a dedup1b dedup1c dedup1d dedup1e; run;
proc sort data=dedup1; by abi; run;
data others;
set GeneralMerchandise;
length brand $ 30;
brand="";
if companyname =: "MEIJER" then brand="MEIJER";
else if companyname =: "COSCO" or companyname =: "COSTCO" then brand="COSTCO";
run;
data proj.GeneralMerchandise(drop=dedupbrand);
merge others(in=a) dedup1(in=b keep=abi dedupbrand);
by abi;
if a;
if b then brand=dedupbrand;
format _all_;
run;
*** Create output data set for Convenience Stores. ;
data ConvenienceStores;
set ConvenienceStores;
length word1-word3 $ 24;
word1=scan(address,1);
word2=scan(address,2);
word3=scan(address,3);
run;
title3"qc: check convnamechain is not missing";
proc freq;
tables convnamechain;
run;
title3;
proc sort data=ConvenienceStores;
by address cityname statealpha digitzipcode;
run;
proc means data=ConvenienceStores noprint;
by address cityname statealpha digitzipcode;
var convnamechain;
output out=outcsa min=min_chain max=max_chain n=count;
run;
data outcsa(keep=address cityname statealpha digitzipcode count min_chain max_chain);
set outcsa;
if count>1;
run;
data T7R2_ConvenienceStores(drop=count);
merge ConvenienceStores(in=a) outcsa(in=b);
by address cityname statealpha digitzipcode;
if a;
if b then mult_sto=count-1;
else mult_sto=0;
run;
data T7R3_ConvenienceStores;
set T7R2_ConvenienceStores;
if not (matchcode in ("1","2","3","4")) then delete;
run;
data T7R4_ConvenienceStores;
set T7R3_ConvenienceStores;
if mult_sto>0 and min_chain=0 and max_chain=1 and sgnamechain=0 then delete;
run;
proc sort data=T7R4_ConvenienceStores nodupkey out=T7R5_ConvenienceStores(drop=word1 word2 word3);
by companyname cityname statealpha digitzipcode word1 word2 word3;
run;
data proj.ConvenienceStores;
set T7R5_ConvenienceStores;
deleteflag=0;
if statealpha="AK" or statealpha="HI" then delete;
format _all_;
run;
*** Create output data set for Liquor Stores. ;
data T7R2_LiquorStores;
set LiquorStores;
length word1-word3 $ 24;
word1=scan(address,1);
word2=scan(address,2);
word3=scan(address,3);
word_r=ranuni(234);
run;
data T7R3_LiquorStores;
set T7R2_LiquorStores;
if not (matchcode in ("1","2","3","4")) then delete;
run;
*** Are there any stores duplicated at location because of multiple addresses that are exactly the same? ;
*** De-duplicate of same addresses ;
proc sort data=T7R3_LiquorStores nodupkey out=T7R4_LiquorStores;
by address cityname statealpha digitzipcode;
run;
*** Drop duplicates with same company name and first 3 "words" of the address field, city, state, zip ;
proc sort data=T7R4_LiquorStores nodupkey out=T7R5_LiquorStores(drop=word1 word2 word3 word_r);
by companyname cityname statealpha digitzipcode word1 word2 word3;
run;
data proj.LiquorStores;
set T7R5_LiquorStores;
if statealpha="AK" or statealpha="HI" then delete;
format _all_;
run;
*** Create output data set for Pharmacies. ;
data Pharmacies;
set Pharmacies;
length word1-word3 $ 24;
word1=scan(address,1);
word2=scan(address,2);
word3=scan(address,3);
run;
title3"qc: check phnamechain is not missing";
proc freq;
tables phnamechain;
run;
title3;
proc sort data=Pharmacies;
by address cityname statealpha digitzipcode;
run;
proc means data=Pharmacies noprint;
by address cityname statealpha digitzipcode;
var phnamechain;
output out=outpa min=min_chain max=max_chain n=count;
run;
data outpa(keep=address cityname statealpha digitzipcode count min_chain max_chain);
set outpa;
if count>1;
run;
data T7R2_Pharmacies(drop=count);
merge Pharmacies(in=a) outpa(in=b);
by address cityname statealpha digitzipcode;
if a;
if b then mult_sto=count-1;
else mult_sto=0;
run;
data T7R3_Pharmacies;
set T7R2_Pharmacies;
if not (matchcode in ("1","2","3","4")) then delete;
run;
data T7R4_Pharmacies;
set T7R3_Pharmacies;
if mult_sto>0 and min_chain=0 and max_chain=1 and sgnamechain=0 then delete;
run;
proc sort data=T7R4_Pharmacies nodupkey out=T7R5_Pharmacies(drop=word1 word2 word3);
by companyname cityname statealpha digitzipcode word1 word2 word3;
run;
data proj.Pharmacies;
set T7R5_Pharmacies;
if statealpha="AK" or statealpha="HI" then delete;
format _all_;
run;
*** Create output data set for Supermarkets and Grocery Stores. ;
data SupermarketsGrocery;
set SupermarketsGrocery;
length word1-word3 $ 24;
word1=scan(address,1);
word2=scan(address,2);
word3=scan(address,3);
run;
title3"qc: check phnamechain is not missing";
proc freq;
tables phnamechain;
run;
title3;
proc sort data=SupermarketsGrocery;
by address cityname statealpha digitzipcode;
run;
proc means data=SupermarketsGrocery noprint;
by address cityname statealpha digitzipcode;
var type_reg;
output out=outsga min=min_chain max=max_chain n=count;
run;
data outsga(keep=address cityname statealpha digitzipcode count min_chain max_chain);
set outsga;
if count>1;
run;
data T7R2_SupermarketsGrocery(drop=count);
merge SupermarketsGrocery(in=a) outsga(in=b);
by address cityname statealpha digitzipcode;
if a;
if b then mult_sto=count-1;
else mult_sto=0;
run;
data T7R3_SupermarketsGrocery;
set T7R2_SupermarketsGrocery;
if not (matchcode in ("1","2","3","4")) then delete;
run;
data T7R4_SupermarketsGrocery;
set T7R3_SupermarketsGrocery;
if min_chain