/* This program was developed by Westat, Inc. under NCI contract HHSN261201800002B*/
/* Version 1.0, copyright 2020 */
/* If you publish results based on these programs, please include the following citation:
    InfoUSA Measure Construction, Version 1.0 – December 2020;
    Division of Cancer Control and Population Sciences, National Cancer Institute.
*/

/* The program is distributed under the terms of the GNU General Public License:

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details <http://www.gnu.org/licenses/>.
*/


/**********************************************************************************************************
PROGRAM:
   #01_ImportInfoUSA.sas

OBJECTIVE:
   Measure Construction for InfoUSA data.
   This program follows the original STATA script in the appendix of the documentation
   for the Weight and Veterans' Environments Study.

INPUT:
    RetailerNameLists.xlsx
        - Excel workbook with list of business names used in the original STATA script. The business names
          were originally coded directly into the STATA script; by storing the business names in Excel
          we can more easily add/delete business names as well improve readability of the SAS program.
    Order_1082862.xlsx
        - Excel file with Colorado data provided by InfoUSA.
    info_geo.xlsx
        - Excel file with geocode information for each record in the InfoUSA file.

OUTPUT:
    GeneralMerchandise.sas7bdat
    ConvenienceStores.sas7bdat
    LiquorStores.sas7bdat
    Pharmacies.sas7bdat
    SupermarketsGrocery.sas7bdat
    Unclassified.sas7bdat

Note: Script will need to be adjusted with project-specific file names and locations
**********************************************************************************************************/

options mprint mlogic;
libname proj "FILEPATH";

    %macro ImportXLSXListIUSA(src=,byvar=,list=,invar=);
    **** This macro is used to import the list of business names from a single worksheet from the RetailerNameLists.xlsx workbook. ;
    **** The business names in the InfoUSA worksheet are formatted to consider multiple matching conditions ;
    **** from the original STATA script. After import, the list of business is compared to the InfoUSA source data set and
    **** a variable, &invar, is created to represent whether the InfoUSA business name meets ;
    **** any of the matching conditions of the business name from the Excel sheet. ;

    *** Import a specified Excel worksheet of business names. ;
    proc import datafile="FILEPATH\RetailerNameLists.xlsx"
                out=rlist
                dbms=xlsx
                replace;
                sheet="&list.";
    run;

    *** Convert STATA syntax to SAS syntax. ;
    data rlist;
        length retailer $50;
        set rlist;
        retailer=translate(retailer,'%','*');
        retailer=translate(retailer,'_','?');
        obs=_N_;
    run;

    *** Build lines of code using the business names. ;
    %let wherestring1=;
    %let wherestring2=;
    proc sql noprint ;
        select 'cn like "' || left(trim(retailer)) || '" ' into :wherestring1 separated by ' or '
        from rlist where obs<=180;
        select 'or cn like "' || left(trim(retailer)) || '" ' into :wherestring2 separated by ' '
        from rlist where obs>180;
    quit;

    *** Starting with the InfoUSA source data, create a subset data set that contains records that ;
    *** company names that match the business names from the Excel list. ;
    proc sort data=&src.;
        by &byvar.;
    run;
    data listcond;
        set &src.(rename=(companyname=cn));
        where
            &wherestring1.
            &wherestring2.
        ;
    run;

    *** Merge the subset of matching business names back to the source InfoUSA data set.;
    *** Create a binary indicator to show whether or not a given company successfully matched with ;
    *** a business name from the Excel sheet.;
    data &src.;
        merge &src.(in=a) listcond(in=b keep=&byvar.);
        by &byvar.;
        if a;
        if b then &invar.=1;
        else &invar.=0;
    run;

    %mend ImportXLSXListIUSA;


****** Input files ;
****** Original file from InfoUSA ;
%let input1 = FILEPATH\Order_1082862.xlsx ;
****** File with geocode information ;
%let input2 = FILEPATH\info_geo.xlsx ;

proc import datafile="&input1."
            out=InfoUSA
            dbms=xlsx
            replace;
run;

**** Reduce InfoUSA data files to only necessary variables for making measures.;
data InfoUSA;
    set InfoUSA(rename=(
    VAR17=EmpSize5Location
    VAR18=EmpSize6Corporate
    VAR23=SalesVol5Location
    VAR24=SalesVol6Corporate
    SIC6_Descriptions__PRIMARYSIC_=PrimarySicDescription));
    rename
    Company=companyname
    address_line_1=address
    city=cityname
    state=statealpha
    zipcode=digitzipcode
    Primary_SIC_Code=PrimarySic
    ;
    drop
    Landmark_:
    Mailing_:
    Unit_:
    Gender
    Call_Status_Code
    Subsidiary_Number--Census_Block
    NAICS_Code
    Primary_NAICS_Code
    NAICS8_Descriptions
    SIC_:
    SIC6:
    ;
    actualsalesvolume=input(Parent_Actual_Sales_Volume,9.);
run;


**** Check for duplicates;
proc sql noprint;
    create table InfoUSADuplicates as
    select companyname,address,cityname,statealpha,digitzipcode, count(*) as nObs
    from InfoUSA
    group by companyname,address,cityname,statealpha,digitzipcode
    having count(*)>1;
quit;
title2"QC1: Duplicates in InfoUSA file based on ";
title3"companyname,address,cityname,statealpha,digitzipcode";
proc print data=InfoUSADuplicates;
    var companyname address cityname statealpha digitzipcode nObs;
run;
title3;
title2;

proc sort data=InfoUSA out=qc nodupkey;
    by ABI;
run;


**** This portion of the file separates different store types for measure construction ;
data InfoUSA;
    set InfoUSA;
    companyname=upcase(companyname);

    *** Count field value 1 required to make density maps;
    count=1;
    *** Liquor Stores ;
    liquor = 0;
    if primarysic in ("592102" "592103" "592104") then liquor = 1;
run;

*** General Merchandise;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 1, invar=in1);
data InfoUSA(drop=in1);
    set InfoUSA;
    if in1 and
       not(companyname in ("WAL-MART NEIGHBORHOOD MARKET","WALMART NEIGHBORHOOD MARKET","COSCOB SPORTS MEDICINE")) then GenMerch=1;
    else GenMerch=0;
run;


*** Pharmacies. Initially set pharmacy type based on SIC code. ;
data InfoUSA;
    set InfoUSA;
    sic4=substr(PrimarySic,1,4);
    if sic4="5912" then pharm=1;
    else pharm=0;
run;

*** Set pharmacy type to 0 if it is part of a general merchandise store.;
*** We will set pharmacy type to 0 for grocery stores in a later step.;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 2, invar=in2);
data InfoUSA(drop=in2);
    set InfoUSA;
    if in2 and genmerch=1 and pharm=1 then pharm=0;
run;

*** The next section creates duppharmgenmerch and duplatlonggenmerch, which are indicators of duplicates.;
proc freq data=InfoUSA noprint;
    tables address * cityname * statealpha * zip4 / out=outd1;
    tables latitude * longitude / out=outd2;
    where genmerch=1;
run;

data outd1;
    set outd1;
    if count>1;
run;
data outd2;
    set outd2;
    if count>1;
run;

proc sort data=InfoUSA(drop=count);
    by address cityname statealpha zip4;
run;
data InfoUSA(drop=count);
    merge InfoUSA(in=a) outd1(in=b keep=address cityname statealpha zip4 count);
    by address cityname statealpha zip4;
    if a;
    if b then duppharmgenmerch=count-1;
run;

proc sort data=InfoUSA;
    by latitude longitude;
run;
data InfoUSA(drop=count);
    merge InfoUSA(in=a) outd2(in=b keep=latitude longitude count);
    by latitude longitude;
    if a;
    if b then duplatlonggenmerch=count-1;
run;

*** Convenience stores. Initially set convenience store type based on SIC code. ;
data InfoUSA;
    set InfoUSA;
    if primarysic in ("544103","554101","554103") then conv=1;
    else conv=0;
run;

*** We will check to see if the convenience stores are actually pharmacies and ;
*** unset the flag if found in a list of pharmacies. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 3, invar=in3);
data InfoUSA(drop=in3);
    set InfoUSA;
    if in3 and conv=1 then conv=0;
run;

*** Set pharmacy flag for every pharmacy found under convenience stores.;
data InfoUSA;
    set InfoUSA;
    if conv=0 and primarysic in ("541103" "554101" "554103") then pharm=1;
run;

*** The next section checks is any of the convenience stores are in a list of ;
*** chain supermarkets or general merchandise. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 4, invar=in4);
data listcond4alt;
    set InfoUSA;
    where companyname like 'SHOP%SAVE';
run;
data InfoUSA;
    merge InfoUSA(in=a) listcond4alt(in=b keep=abi);
    by abi;
    if a;
    if b then in4=1;
run;
data InfoUSA(drop=in4);
    set InfoUSA;
    if in4 and conv=1 then conv=0;
    if companyname = "BIALOWIESKI MARKET" then conv=1;
    if companyname = "BILO GAS N GO" then conv=1;
run;

*** Supermarkets and grocery stores. Initially set supgroc indicator based on SIC code. ;
*** But unset supgroc indicator if it is already a general merchandise store.;
data InfoUSA;
    set InfoUSA;
    if primarysic in ("541101","541102","541104","541105","541106","541107","541108","541109") then supgroc=1;
    if genmerch=1 then supgroc=0;
run;

*** Unset supgroc indicator if it is in a list of pharmacies;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 5, invar=in5);
data InfoUSA(drop=in5);
    set InfoUSA;
    if in5 and supgroc=1 then pharm=1;
    if pharm=1 then supgroc=0;
run;

*** Unset supgroc indicator if it is in a list of convenience stores.;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 6, invar=in6);
data InfoUSA(drop=in6);
    set InfoUSA;
    if in6  and supgroc=1 then conv=1;
    if conv=1 then supgroc=0;
    *** Create a whole_sale indicator;
    whole_sale=0;
    if companyname="BOZZUTO'S" then whole_sale=1;
run;


*********************************************;
****CHAINS: AFTER CLASSIFICATION, pull chains;
*********************************************;

data InfoUSA ;
    set InfoUSA ;

    ***Supermarkets/Grocery Stores;
    ***Flag stores with > $2 million in annual sales. This will only be useful for Supermarkets/Grocery Stores;
    twomil = 0;
    if actualsalesvolume>1999 & actualsalesvolume ne . then twomil=1;

    ***Create Regional 11+ designator - shows stores that have 11 or more stores in the region = FMI definition of Supermarket;
    region=0;
    if statealpha="CT" or statealpha="ME" or statealpha="MA" or statealpha="NH" or statealpha="RI" or statealpha="VT" then region=1;
    if statealpha="NJ" or statealpha="NY" or statealpha="PA" then region=2;
    if statealpha="IN" or statealpha="IL" or statealpha="MI" or statealpha="OH" or statealpha="WI" then region=3;
    if statealpha="IA" or statealpha="KS" or statealpha="MN" or statealpha="MO" or statealpha="NE" or statealpha="ND" or statealpha="SD" then region=4;
    if statealpha="DE" or statealpha="DC" or statealpha="FL" or statealpha="GA" or statealpha="MD" or statealpha="NC" or statealpha="SC" or statealpha="VA" or statealpha="WV" then region=5;
    if statealpha="AL" or statealpha="KY" or statealpha="MS" or statealpha="TN" then region=6;
    if statealpha="AR" or statealpha="LA" or statealpha="OK" or statealpha="TX" then region=7;
    if statealpha="AZ" or statealpha="CO" or statealpha="ID" or statealpha="NM" or statealpha="MT" or statealpha="UT" or statealpha="NV" or statealpha="WY" then region=8;
    if statealpha="AK" or statealpha="CA" or statealpha="HI" or statealpha="OR" or statealpha="WA" then region=9;
run;

proc freq data=InfoUSA noprint;
    tables CompanyName * region / out=outcr;
run;
data outcr;
    set outcr;
    if count>=11;
run;

proc sort data=InfoUSA;
    by companyname region;
run;
data InfoUSA;
    merge InfoUSA(in=a) outcr(in=b keep=companyname region);
    by companyname region;
    if a;
    if b then elevenregion=1;
    else elevenregion=0;
run;
proc sort data=InfoUSA;
    by abi;
run;


*** Flag stores that are chain stores. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 7, invar=in7);
data listcond7alt;
    set InfoUSA;
    where companyname like 'SHOP%SAVE';
run;
data InfoUSA;
    merge InfoUSA(in=a) listcond7alt(in=b keep=abi);
    by abi;
    if a;
    if b then in7=1;
run;
data InfoUSA(drop=in7);
    set InfoUSA;
    sgnamechain=0;
    if in7 and supgroc=1 then sgnamechain=1;
    if indexw(companyname,"IGA") and supgroc=1 then sgnamechain=1;
run;

*** Convenience stores. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 8, invar=in8);
    data InfoUSA(drop=in8);
        set InfoUSA;
        convnamechain=0;
        if in8 and conv=1 then convnamechain=1;
    run;

*** Pharmacies. ;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 9, invar=in9);
data InfoUSA(drop=in9 untruepull);
    set InfoUSA;
    phnamechain=0;
    if in9 and pharm=1 then phnamechain=1;

    *** This variable culls those that should not have been pulled due to the name list. ;
    *** What that means is that we asked for SIC code 591205, and then for a name search ;
    *** to be performed for the above list of names on SIC4 5912. So if there is a record ;
    *** that has a SIC4 of 5912 but does not have a name on this list, it was not pulled as a pharmacy.;
    if phnamechain=0 & primarysic ne "591205" then untruepull=1;
    if untruepull=1 then pharm=0;
run;


*** Generate a flag genmerchmem for specific general merchandise. ;
data genmerchmem;
    set InfoUSA;
    where genmerch=1 and
    (
    companyname like "COSCO%" or
    companyname like "COSTCO%" or
    companyname like "SAM_S_CLUB" or
    companyname like "BJ'S_WHOLESALE%" or
    companyname like "BJS_WHOLESALE%"
    );
run;

data InfoUSA;
    merge InfoUSA(in=a) genmerchmem(in=b keep=abi);
    by abi;
    if a;
    if b then genmerchmem=1;
run;

*** Supermarkets are "typed" at this point, so that if in classifying other store types something was turned into a supermarket, it still gets a type. ;
proc format;
    value classif
    1="<$2 Mill, No Chain Name, <11 Stores"
    2="<$2 Mill, No Chain Name, 11+ Stores"
    3="<$2 Mill, Chain Name, <11 Stores"
    4="<$2 Mill, Chain Name, 11+ Stores"
    5="$2+ Mill, No Chain Name, <11 Stores"
    6="$2+ Mill, No Chain Name, 11+ Stores"
    7="$2+ Mill, Chain Name, <11 Stores"
    8="$2+ Mill, Chain Name, 11+ Stores";
run;
data InfoUSA;
    set InfoUSA;
    type_reg=0;
    if twomil=0 & sgnamechain=0 & elevenregion=0 then type_reg=1;
    if twomil=0 & sgnamechain=0 & elevenregion=1 then type_reg=2;
    if twomil=0 & sgnamechain=1 & elevenregion=0 then type_reg=3;
    if twomil=0 & sgnamechain=1 & elevenregion=1 then type_reg=4;
    if twomil=1 & sgnamechain=0 & elevenregion=0 then type_reg=5;
    if twomil=1 & sgnamechain=0 & elevenregion=1 then type_reg=6;
    if twomil=1 & sgnamechain=1 & elevenregion=0 then type_reg=7;
    if twomil=1 & sgnamechain=1 & elevenregion=1 then type_reg=8;
    format type_reg classif.;
run;


*** It was found that many records had PO Boxes in the address fields.;
*** Flag PO Boxes by Store Type ;
data pobox;
    set InfoUSA;
    where
    address like "PO BOX%" or
    address like "%PO BOX%" or
    address like "POST OFFICE BOX%" or
    address like "RR BOX%" or
    address like "%RR BOX%" or
    address like "RURAL ROUTE BOX%" or
    address like "HC___BOX%" or
    address like "HC____BOX%" or
    address like "HC_____BOX%" or
    strip(address) like "HCBOX%"
    ;
run;

data InfoUSA;
    merge InfoUSA(in=a) pobox(in=b keep=abi);
    by abi;
    if a;
    pobox=0;
    if b then pobox=1;
run;

title2"Total Raw Count reports - by PO Box";
title3"Overall";
proc freq data=InfoUSA;
    tables pobox ;
run;
title3"Supermarkets/Grocery";
proc freq data=InfoUSA;
    tables pobox ;
    where supgroc=1;
run;
title3"Convenience Stores";
proc freq data=InfoUSA;
    tables pobox ;
    where conv=1;
run;
title3"Pharmacies";
proc freq data=InfoUSA;
    tables pobox ;
    where pharm=1;
run;
title3"Liquor Stores";
proc freq data=InfoUSA;
    tables pobox ;
    where liquor=1;
run;
title3"General Merchandise";
proc freq data=InfoUSA;
    tables pobox ;
    where genmerch=1;
run;
title3;
title2;

    data T7R1_GeneralMerchandise T7R1_Liquor T7R1_Pharmacy T7R1_Convenience T7R1_SupermarketGrocery;
        set InfoUSA;
        if genmerch=1 then output T7R1_GeneralMerchandise;
        if liquor=1 then output T7R1_Liquor;
        if pharm=1 then output T7R1_Pharmacy;
        if conv=1 then output T7R1_Convenience;
        if supgroc=1 then output T7R1_SupermarketGrocery;
    run;


title2"Housekeeping reports: based on address cityname statealpha zip4";
title3"Duplicate Supermarkets";
proc sql;
    select address,cityname,statealpha,zip4, count(*) as n
    from InfoUSA
    where supgroc=1
    group by address,cityname,statealpha,zip4
    having count(*)>1;
quit;
title3;
title3"Duplicate General Merchandise/Grocery";
proc sql;
    select address,cityname,statealpha,zip4, count(*) as n
    from InfoUSA
    where genmerch=1
    group by address,cityname,statealpha,zip4
    having count(*)>1;
quit;
title3;
title3"Duplicate Convenience Stores";
proc sql;
    select address,cityname,statealpha,zip4, count(*) as n
    from InfoUSA
    where conv=1
    group by address,cityname,statealpha,zip4
    having count(*)>1;
quit;
title3;
title3"Duplicate Pharmacies";
proc sql;
    select address,cityname,statealpha,zip4, count(*) as n
    from InfoUSA
    where pharm=1
    group by address,cityname,statealpha,zip4
    having count(*)>1;
quit;
title3;
title3"Duplicate Liquor";
proc sql;
    select address,cityname,statealpha,zip4, count(*) as n
    from InfoUSA
    where liquor=1
    group by address,cityname,statealpha,zip4
    having count(*)>1;
quit;
title3;
title2;

title2 "Cross-Classifications between General Merchandise and Pharmacy";
proc print data=InfoUSA;
    where genmerch=1 & pharm=1;
run;
title2;
title2 "Cross-Classifications between General Merchandise and Liquor";
proc print data=InfoUSA;
    where genmerch=1 & liquor=1;
run;
title2;
title2 "Cross-Classifications between General Merchandise and Convenience";
proc print data=InfoUSA;
    where genmerch=1 & conv=1;
run;
title2;
title2 "Cross-Classifications between General Merchandise and Supermarkets/Groceries";
proc print data=InfoUSA;
    where genmerch=1 & supgroc=1;
run;
title2;

title2"Stores Cross-Classified as General Merchandise and Pharmacy will (quietly) be made General Merchandise";
proc print data=InfoUSA;
    var companyname;
    where pharm=1 and genmerch=1;
run;
title2;
%ImportXLSXListIUSA(src=InfoUSA, byvar=abi, list=InfoUSA list 10, invar=in10);
data InfoUSA(drop=in10);
    set InfoUSA;
    if pharm=1 and genmerch=1 and in10 then pharm=0;
run;

title2"Stores Cross-Classified as General Merchandise and Liquor will (quietly) be made General Merchandise";
data cc2;
    set InfoUSA;
    where liquor=1 and genmerch=1 and
          (companyname like 'COSCO%' or
           companyname like 'COSTCO%');
run;
proc print data=cc2;
    var companyname;
    where liquor=1 and genmerch=1;
run;
title2;
data InfoUSA;
    merge InfoUSA(in=a) cc2(in=b keep=ABI);
    by ABI;
    if a;
    if b then liquor=0;
run;
title2;

title2 "Cross-Classifications between Pharmacies and Liquor";
proc print data=InfoUSA n;
    var companyname;
    where pharm=1 & liquor=1;
run;
title2;
title2 "Cross-Classifications between Pharmacies and Convenience";
proc print data=InfoUSA n;
    var companyname;
    where pharm=1 & conv=1;
run;
title2;
title2 "Cross-Classifications between Pharmacies and Supermarkets/Groceries";
proc print data=InfoUSA n;
    var companyname;
    where pharm=1 & supgroc=1;
run;
title2;
title2 "Cross-Classifications between Convenience and Liquor";
proc print data=InfoUSA n;
    var companyname;
    where conv=1 & liquor=1;
run;
title2;
title2 "Cross-Classifications between Convenience and Supermarkets/Groceries";
proc print data=InfoUSA n;
    var companyname;
    where conv=1 & supgroc=1;
run;
title2;
title2 "Cross-Classifications between Liquor and Supermarkets/Groceries";
proc print data=InfoUSA n;
    var companyname;
    where liquor=1 & supgroc=1;
run;
title2;


*** What are we not classifying? There are many not classified at this point, including all the PAFacilities. ;

proc format;
    value type
    0="not classified"
    1="general merchandise"
    2="liquor"
    3="pharmacy"
    4="convenient store"
    5="grocery/super market";
run;

data InfoUSA;
    set InfoUSA;
    typestore=0;
    if genmerch=1 then typestore=1;
    if liquor=1 then typestore=2;
    if pharm=1 then typestore=3;
    if conv=1 then typestore=4;
    if supgroc=1 then typestore=5;
    label typestore="Macro store type";
    format typestore type.;
run;

title2"all stores by type";
proc freq data=InfoUSA;
    tables typestore;
run;

*** In this section, individual store type files are made. This is important because deduplication is done within store types, ;
*** so they have to be separated out, first. ;

title2"Supermarket/Grocery Stores";
data SupermarketsGrocery;
    set InfoUSA;
    if supgroc=1;
run;
proc freq;
    tables primarysic primarysicdescription;
run;
title2"Pharmacies";
data Pharmacies;
    set InfoUSA;
    if pharm=1;
run;
proc freq;
    tables primarysic primarysicdescription;
run;
title2"Convenience Stores";
data ConvenienceStores;
    set InfoUSA;
    if conv=1;
run;
proc freq;
    tables primarysic primarysicdescription;
run;
title2"Liquor Stores";
data LiquorStores;
    set InfoUSA;
    if Liquor=1;
run;
proc freq;
    tables primarysic primarysicdescription;
run;
title2"General Merchandise Stores";
data GeneralMerchandise;
    set InfoUSA;
    if genmerch=1;
run;
proc freq;
    tables primarysic primarysicdescription;
run;
title2"PA Facilites/Unclassified";
data Unclassified;
    set InfoUSA;
    if typestore=0;
run;
proc freq;
    tables primarysic primarysicdescription;
run;
title2;

*** In the original script, PO Box Geocodes were replaced with closest address Geocodes. ;
*** For this script, we will add on closest address Geocode information for all records, not just PO Boxes. ;
proc import datafile="&input2."
            out=igeo(keep=abi Loc_name  Match_addr X Y dist_mi dist_m TRACT18
                     rename=(abi=abin))
            dbms=xlsx
            replace;
run;
data igeo(drop=abin);
    set igeo;
    length abi $9;
    abi=put(abin,9.);

    length Loc_Name $12;
    if      Loc_name = 'PointAddress'  then Geocode_Level = '1-PointAddr';
    else if Loc_name = 'StreetAddress' then Geocode_Level = '2-StreetAddr';
    else if Loc_name = 'PostalExt'     then Geocode_Level = '4-9digitZIP';
    else if Loc_name = 'StreetName'    then Geocode_Level = '5-StreetName';
    else if Loc_name = 'Postal'        then Geocode_Level = '6-5digitZIP';
    else if Loc_name = 'AdminPlaces'   then Geocode_Level = '7-AdminPlace';
    else                                    Geocode_Level = '9-' || Loc_name;
    length MatchCode $1;
    MatchCode = substr(Geocode_Level,1,1);
run;
proc sort data=igeo;
    by abi;
run;

data SupermarketsGrocery T7R2_SupermarketsGrocery;
    merge SupermarketsGrocery(in=a) igeo;
    by abi;
    if a;
    output SupermarketsGrocery; output T7R2_SupermarketsGrocery;
run;
data Pharmacies T7R2_Pharmacies;
    merge Pharmacies(in=a) igeo;
    by abi;
    if a;
    output Pharmacies; output T7R2_Pharmacies;
run;
data ConvenienceStores T7R2_ConvenienceStores;
    merge ConvenienceStores(in=a) igeo;
    by abi;
    if a;
    output ConvenienceStores; output T7R2_ConvenienceStores;
run;
data LiquorStores T7R2_LiquorStores;
    merge LiquorStores(in=a) igeo;
    by abi;
    if a;
    output LiquorStores; output T7R2_LiquorStores;
run;
data GeneralMerchandise T7R2_GeneralMerchandise;
    merge GeneralMerchandise(in=a) igeo;
    by abi;
    if a;
    output GeneralMerchandise; output T7R2_GeneralMerchandise;
run;
data Unclassified ;
    merge Unclassified(in=a) igeo;
    by abi;
    if a;
run;


*** Finally, create output data sets for each of the six store types. ;
*** With each store type, remove duplicate rows and chain store rows. ;

*** Create output data set for General Merchandise. ;
data T7R3_GeneralMerchandise;
    set GeneralMerchandise;
    if not (matchcode in ("1","2","3","4")) then delete;
run;
data dedup1a;
    set GeneralMerchandise;
    where
    companyname like "WALMART%" or
    companyname like "SUPER_WALMART%" or
    companyname like "SUPER_WAL_MART%" or
    companyname like "WAL_MART*" ;
run;
data dedup1a;
    set dedup1a;
    length dedupbrand $ 30;
    dedupbrand="WALMART";
run;

data dedup1b;
    set GeneralMerchandise;
    where
    companyname like "KMART%" or
       companyname = "SUPERKMART" or
    companyname like "SUPER_KMART" or
       companyname = "SUPERK-MART" or
    companyname like "SUPERK-MART" or
    companyname like "SUPER_K-MART" or
    companyname like "K-MART%" ;
run;
data dedup1b;
    set dedup1b;
    length dedupbrand $ 30;
    dedupbrand="KMART";
run;

data dedup1c;
    set GeneralMerchandise;
    where
    companyname like "TARGET" or
    companyname like "SUPERTARGET" or
    companyname like "SUPER_TARGET" or
    companyname like "TARGETSUPER%" or
    companyname like "TARGET_SUPER%" ;
run;
data dedup1c;
    set dedup1c;
    length dedupbrand $ 30;
    dedupbrand="TARGET";
run;

data dedup1d;
    set GeneralMerchandise;
    where
    companyname like "SAMSCLUB" or
    companyname like "SAMS_CLUB" or
    companyname like "SAM_S_CLUB" ;
run;
data dedup1d;
    set dedup1d;
    length dedupbrand $ 30;
    dedupbrand="SAMS";
run;

data dedup1e;
    set GeneralMerchandise;
    where
    companyname like "BJ'SWHOLESALE%" or
    companyname like "BJSWHOLESALE%" or
    companyname like "BJ'S_WHOLESALE%" or
    companyname like "BJS_WHOLESALE%"
    ;
run;
data dedup1e;
    set dedup1e;
    length dedupbrand $ 30;
    dedupbrand="BJS";
run;

data dedup1; set dedup1a dedup1b dedup1c dedup1d dedup1e; run;
proc sort data=dedup1; by abi; run;
data others;
    set GeneralMerchandise;
    length brand $ 30;
    brand="";
    if companyname =: "MEIJER" then brand="MEIJER";
    else if companyname =: "COSCO" or companyname =: "COSTCO" then brand="COSTCO";
run;
data proj.GeneralMerchandise(drop=dedupbrand);
    merge others(in=a) dedup1(in=b keep=abi dedupbrand);
    by abi;
    if a;
    if b then brand=dedupbrand;
    format _all_;
run;

*** Create output data set for Convenience Stores. ;
data ConvenienceStores;
    set ConvenienceStores;
    length word1-word3 $ 24;
    word1=scan(address,1);
    word2=scan(address,2);
    word3=scan(address,3);
run;

    title3"qc: check convnamechain is not missing";
    proc freq;
    tables convnamechain;
    run;
    title3;

proc sort data=ConvenienceStores;
    by address cityname statealpha digitzipcode;
run;
proc means data=ConvenienceStores noprint;
    by address cityname statealpha digitzipcode;
    var convnamechain;
    output out=outcsa min=min_chain max=max_chain n=count;
run;
data outcsa(keep=address cityname statealpha digitzipcode count min_chain max_chain);
    set outcsa;
    if count>1;
run;
data T7R2_ConvenienceStores(drop=count);
    merge ConvenienceStores(in=a) outcsa(in=b);
    by address cityname statealpha digitzipcode;
    if a;
    if b then mult_sto=count-1;
    else mult_sto=0;
run;

data T7R3_ConvenienceStores;
    set T7R2_ConvenienceStores;
    if not (matchcode in ("1","2","3","4")) then delete;
run;

data T7R4_ConvenienceStores;
    set T7R3_ConvenienceStores;
    if mult_sto>0 and min_chain=0 and max_chain=1 and sgnamechain=0 then delete;
run;

proc sort data=T7R4_ConvenienceStores nodupkey out=T7R5_ConvenienceStores(drop=word1 word2 word3);
    by companyname cityname statealpha digitzipcode word1 word2 word3;
run;

data proj.ConvenienceStores;
    set T7R5_ConvenienceStores;
    deleteflag=0;
    if statealpha="AK" or statealpha="HI" then delete;
    format _all_;
run;

*** Create output data set for Liquor Stores. ;
data T7R2_LiquorStores;
    set LiquorStores;
    length word1-word3 $ 24;
    word1=scan(address,1);
    word2=scan(address,2);
    word3=scan(address,3);
    word_r=ranuni(234);
run;

data T7R3_LiquorStores;
    set T7R2_LiquorStores;
    if not (matchcode in ("1","2","3","4")) then delete;
run;

*** Are there any stores duplicated at location because of multiple addresses that are exactly the same? ;
*** De-duplicate of same addresses ;
proc sort data=T7R3_LiquorStores nodupkey out=T7R4_LiquorStores;
    by address cityname statealpha digitzipcode;
run;

*** Drop duplicates with same company name and first 3 "words" of the address field, city, state, zip ;
proc sort data=T7R4_LiquorStores nodupkey out=T7R5_LiquorStores(drop=word1 word2 word3 word_r);
    by companyname cityname statealpha digitzipcode word1 word2 word3;
run;
data proj.LiquorStores;
    set T7R5_LiquorStores;
    if statealpha="AK" or statealpha="HI" then delete;
    format _all_;
run;

*** Create output data set for Pharmacies. ;
data Pharmacies;
    set Pharmacies;
    length word1-word3 $ 24;
    word1=scan(address,1);
    word2=scan(address,2);
    word3=scan(address,3);
run;

    title3"qc: check phnamechain is not missing";
    proc freq;
    tables phnamechain;
    run;
    title3;

proc sort data=Pharmacies;
    by address cityname statealpha digitzipcode;
run;
proc means data=Pharmacies noprint;
    by address cityname statealpha digitzipcode;
    var phnamechain;
    output out=outpa min=min_chain max=max_chain n=count;
run;
data outpa(keep=address cityname statealpha digitzipcode count min_chain max_chain);
    set outpa;
    if count>1;
run;
data T7R2_Pharmacies(drop=count);
    merge Pharmacies(in=a) outpa(in=b);
    by address cityname statealpha digitzipcode;
    if a;
    if b then mult_sto=count-1;
    else mult_sto=0;
run;

data T7R3_Pharmacies;
    set T7R2_Pharmacies;
    if not (matchcode in ("1","2","3","4")) then delete;
run;

data T7R4_Pharmacies;
    set T7R3_Pharmacies;
    if mult_sto>0 and min_chain=0 and max_chain=1 and sgnamechain=0 then delete;
run;

proc sort data=T7R4_Pharmacies nodupkey out=T7R5_Pharmacies(drop=word1 word2 word3);
    by companyname cityname statealpha digitzipcode word1 word2 word3;
run;

data proj.Pharmacies;
    set T7R5_Pharmacies;
    if statealpha="AK" or statealpha="HI" then delete;
    format _all_;
run;

*** Create output data set for Supermarkets and Grocery Stores. ;
data SupermarketsGrocery;
    set SupermarketsGrocery;
    length word1-word3 $ 24;
    word1=scan(address,1);
    word2=scan(address,2);
    word3=scan(address,3);
run;

    title3"qc: check phnamechain is not missing";
    proc freq;
    tables phnamechain;
    run;
    title3;

proc sort data=SupermarketsGrocery;
    by address cityname statealpha digitzipcode;
run;
proc means data=SupermarketsGrocery noprint;
    by address cityname statealpha digitzipcode;
    var type_reg;
    output out=outsga min=min_chain max=max_chain n=count;
run;
data outsga(keep=address cityname statealpha digitzipcode count min_chain max_chain);
    set outsga;
    if count>1;
run;
data T7R2_SupermarketsGrocery(drop=count);
    merge SupermarketsGrocery(in=a) outsga(in=b);
    by address cityname statealpha digitzipcode;
    if a;
    if b then mult_sto=count-1;
    else mult_sto=0;
run;

data T7R3_SupermarketsGrocery;
    set T7R2_SupermarketsGrocery;
    if not (matchcode in ("1","2","3","4")) then delete;
run;

data T7R4_SupermarketsGrocery;
    set T7R3_SupermarketsGrocery;
    if min_chain<max_chain and not missing(min_chain) and type_reg<max_chain then delete;
run;

proc sort data=T7R4_SupermarketsGrocery nodupkey out=T7R5_SupermarketsGrocery(drop=word1 word2 word3);
    by companyname cityname statealpha digitzipcode word1 word2 word3;
run;

data proj.SupermarketsGrocery;
    set T7R5_SupermarketsGrocery;
    if statealpha="AK" or statealpha="HI" then delete;
    format _all_;
run;

*** Create output data set for Unclassified ;
data proj.Unclassified;
    set Unclassified;
    format _all_;
run;




title2"Count Summary Table Row 1";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,5) eq "T7R1_";
run;
title2;

title2"Count Summary Table Row 2";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,5) eq "T7R2_";
run;
title2;

title2"Count Summary Table Row 3";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,5) eq "T7R3_";
run;
title2;

title2"Count Summary Table Row 4";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,5) eq "T7R4_";
run;
title2;

title2"Count Summary Table Row 5";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,5) eq "T7R5_";
run;
title2;

title2"Count Summary Table Row 6";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="PROJ" and memname ne "FF_CLASS";
run;
title2;