/* This program was developed by Westat, Inc. under NCI contract HHSN261201800002B*/
/* Version 1.0, copyright 2020 */
/* If you publish results based on these programs, please include the following citation:
    Dun and Bradstreet Measure Construction, Version 1.0 – December 2020;
    Division of Cancer Control and Population Sciences, National Cancer Institute.
*/

/* The program is distributed under the terms of the GNU General Public License:

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details <http://www.gnu.org/licenses/>.
*/
/**********************************************************************************************************
PROGRAM:
   #02_ImportDunBradstreet.sas

OBJECTIVE:
   Measure Construction for Dun & Bradstreet data.
   This program follows the original STATA script in the appendix of the documentation
   for the Weight and Veterans' Environments Study.

INPUT:
    RetailerNameLists.xlsx
        - Excel workbook with list of business names used in the original STATA script. The business names
          were originally coded directly into the STATA script; by storing the business names in Excel
          we can more easily add/delete business names as well improve readability of the SAS program.
    WESTAT_HISTORIC_OUTPUT_AUG2020.xlsx
        - Excel file with Colorado data provided by Dun & Bradstreet.
    dnb_geo.xlsx
        - Excel file with geocode information for each record in the Dun & Bradstreet file.

OUTPUT:
    FF_Class.sas7bdat
    FFR_ForGrid.sas7bdat

Note: Script will need to be adjusted with project-specific file names and locations
**********************************************************************************************************/

options mprint mlogic;
libname proj "FILEPATH";

    %macro ImportXLSXListDB(src=,byvar=,list=,invar=);
    **** This macro is used to import the list of business names from a single worksheet from the RetailerNameLists.xlsx workbook. ;
    **** The business names in the Dun & Bradstreet worksheet are formatted to consider multiple matching conditions ;
    **** from the original STATA script. After import, the list of business is compared to the Dun & Bradstreet source data set and
    **** a variable, &invar, is created to represent whether the Dun & Bradstreet business name meets ;
    **** any of the matching conditions of the business name from the Excel sheet. ;

    *** Import a specified Excel worksheet of business names. ;
    proc import datafile="FILEPATH\RetailerNameLists.xlsx"
                out=rlist
                dbms=xlsx
                replace;
                sheet="&list.";
    run;

    *** Convert STATA syntax to SAS syntax. ;
    data rlist;
        length retailer $50;
        set rlist;
        retailer=translate(retailer,'%','*');
        retailer=translate(retailer,'_','?');
        obs=_N_;
    run;

    *** Build lines of code using the business names. ;
    %let wherestring1=;
    %let wherestring2=;
    proc sql noprint ;
        select    'bn like "' || left(trim(retailer)) || '" or tsn like "' || left(trim(retailer)) || '" ' into :wherestring1 separated by ' or '
        from rlist where obs<=90;
        select 'or bn like "' || left(trim(retailer)) || '" or tsn like "' || left(trim(retailer)) || '" ' into :wherestring2 separated by ' '
        from rlist where obs>90;
    quit;

    *** Starting with the Dun & Bradstreet source data, create a subset data set that contains records that ;
    *** company names that match the business names from the Excel list. ;
    proc sort data=&src.;
        by &byvar.;
    run;
    data listcond;
        set &src.(rename=(tradestylename=tsn businessname=bn));
        where
            &wherestring1.
            &wherestring2.
        ;
    run;

    *** Merge the subset of matching business names back to the source Dun & Bradstreet data set.;
    *** Create a binary indicator to show whether or not a given company successfully matched with ;
    *** a business name from the Excel sheet.;
    data &src.;
        merge &src.(in=a) listcond(in=b keep=&byvar.);
        by &byvar.;
        if a;
        if b then &invar.=1;
        else &invar.=0;
    run;

    %mend ImportXLSXListDB;


****** Input files ;
****** Original file from Dun & Bradstreet ;
%let input1 = FILEPATH\WESTAT_HISTORIC_OUTPUT_AUG2020.xlsx ;
****** File with geocode information ;
%let input2 = FILEPATH\dnb_geo.xlsx ;

proc import datafile="&input1."
            out=DunBradstreet(rename=(DUNS=dunsnumber))
            dbms=xlsx
            replace;
run;

proc sort data=DunBradstreet;
    by dunsnumber;
run;

**** SIC code values can be longer than eight digits, so we will keep only first eight digits. ;
data DunBradstreet(rename=(DTRADE=tradestylename DCOMP=businessname));
    set DunBradstreet;
    primarysic = input(DPRIMSI || DSICEXT1, 8.);

    prim8digit=primarysic;

    if length(put(prim8digit,8.))=8 then only8=1; else only8=0;

    length prim8digits $8;
    prim8digits=DPRIMSI || DSICEXT1;
run;

data T7R1DB_;
    set DunBradstreet;
run;

*** Generate count of duplicates;
proc freq data=DunBradstreet noprint;
    tables dunsnumber / out=outid;
run;
data outid;
    set outid;
    duplicateduns=count-1;
run;

data DunBradstreet;
    merge DunBradstreet(in=a) outid(keep=dunsnumber duplicateduns);
    by dunsnumber;
run;

*** Generate storetype variable values based on lists ;
%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 1, invar=in1);
data DunBradstreet(drop=in1);
    set DunBradstreet;
    if in1 then storetype=1;
run;

%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 2, invar=in2);
data DunBradstreet(drop=in2);
    set DunBradstreet;
    if in2 then storetype=2;
run;

%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 3, invar=in3);
data DunBradstreet(drop=in3);
    set DunBradstreet;
    if in3 then storetype=3;
run;

%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 4, invar=in4);
data DunBradstreet(drop=in4);
    set DunBradstreet;
    if in4 then storetype=4;
run;

%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 5, invar=in5);
data DunBradstreet(drop=in5);
    set DunBradstreet;
    if in5 then storetype=5;
run;

*** Generate storetype variable values based on SIC code;
data DunBradstreet;
    set DunBradstreet;
    if prim8digit=58120601 then storetype=6;
    else if prim8digit in (58120602,58120600) then storetype=7;
    else if prim8digit=58120304 then storetype=8;
    else if missing(storetype) then storetype=9;
run;

title2"Dun & Bradstreet file: distribution of storetype";
proc freq;
    tables storetype;
run;
title2;

*** Drop convenience stores and full-service restaurants;
%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 6, invar=in6);
data DunBradstreet(drop=in6);
    set DunBradstreet;
    if storetype>6 and in6 then conv=1;
    else conv=0;
run;

%ImportXLSXListDB(src=DunBradstreet, byvar=dunsnumber, list=Dun Bradstreet list 7, invar=in7);
data DunBradstreet(drop=in7);
    set DunBradstreet;
    if storetype>6 and in7 then fullservice=1;
    else fullservice=0;
run;

data DunBradstreet;
    set DunBradstreet;
    by dunsnumber;
    if conv=1 then delete;
    if fullservice=1 then delete;
run;

data T7R2DB_;
    set DunBradstreet;
run;

title2"Dun & Bradstreet file: distribution of storetype after dropping Convenience and Full Service";
proc freq;
    tables storetype;
run;
title2;

proc format;
    value FFRLabels
    1="Top 5 Non-Sandwich, Non-Coffee Chains"
    2="Subway"
    3="Top Chains, Excluding Top 5, Coffeeshops, and Pizza"
    4="Top Chain CoffeeShops"
    5="Large Chain Pizza"
    6="Chain Pizza, SIC: 58120601"
    7="Non-Chain Pizza, SIC: 58120602 or 58120600"
    8="Non-Chain CoffeeShops, SIC: 58120304"
    9="FFR, Not Elsewhere Classified";
run;

**** Add geocode information to the file. ;
proc import datafile="&input2."
            out=dgeo(keep=duns Loc_name Match_addr X Y TRACT18)
            dbms=xlsx
            replace;
run;
data dgeo;
    set dgeo;
    length dunsnumber $9;
    dunsnumber=put(duns,Z9.);
run;
proc sort data=dgeo(drop=duns);
    by dunsnumber;
run;

data DunBradstreet;
    merge DunBradstreet(in=a) dgeo(in=b);
    by dunsnumber;
    if a;
    format storetype FFRLabels.;
run;

**** Delete rows who have insufficient geocode accuracy. ;
data DunBradstreet;
    set DunBradstreet;

    length Loc_Name $12;
    if      Loc_name = 'PointAddress'  then Geocode_Level = '1-PointAddr';
    else if Loc_name = 'StreetAddress' then Geocode_Level = '2-StreetAddr';
    else if Loc_name = 'PostalExt'     then Geocode_Level = '4-9digitZIP';
    else if Loc_name = 'StreetName'    then Geocode_Level = '5-StreetName';
    else if Loc_name = 'Postal'        then Geocode_Level = '6-5digitZIP';
    else if Loc_name = 'AdminPlaces'   then Geocode_Level = '7-AdminPlace';
    else                                    Geocode_Level = '9-' || Loc_name;
    length MatchCode $1;
    MatchCode = substr(Geocode_Level,1,1);

    if not (matchcode in ("1","2","3","4")) then delete;
run;

data T7R3DB_;
    set DunBradstreet;
run;

**** Delete Alaska & Hawaii rows and save Dun & Bradstreet file permanently. ;
data proj.FF_Class;
    set DunBradstreet;
    if DSTATEAB in ("AK","HI") then delete;
    format _all_;
run;

**** Add count variable necessary for grids generation and save file permanently. ;
data proj.FFR_ForGrid;
    set proj.FF_Class;
    count=1;
run;


title2"Count Summary Table Row 1: Counts";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,7) eq "T7R1DB_";
run;
title2;

title2"Count Summary Table Row 2: Counts";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,7) eq "T7R2DB_";
run;
title2;

title2"Count Summary Table Row 3: Counts";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="WORK" and substr(memname,1,7) eq "T7R3DB_";
run;
title2;

title2"Count Summary Table Row 6: Final counts";
proc print data=sashelp.vtable;
    var libname memname nobs;
    where libname="PROJ" and memname eq "FF_CLASS";
run;
title2;