Skip to content

Commit

Permalink
Merge pull request #74 from worldbank/v1.1
Browse files Browse the repository at this point in the history
V1.1
  • Loading branch information
kbjarkefur authored May 23, 2019
2 parents 8f539e3 + aca09b4 commit 4aada18
Show file tree
Hide file tree
Showing 13 changed files with 521 additions and 92 deletions.
2 changes: 1 addition & 1 deletion src/ado_files/iecodebook.ado
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*! version 1.0 31JAN2018 DIME Analytics [email protected]
*! version 1.1 20MAY2019 DIME Analytics [email protected]

// Main syntax *********************************************************************************

Expand Down
2 changes: 1 addition & 1 deletion src/ado_files/iecompdup.ado
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*! version 1.0 31JAN2019 DIME Analytics [email protected]
*! version 1.1 20MAY2019 DIME Analytics [email protected]

capture program drop iecompdup
program iecompdup , rclass
Expand Down
224 changes: 170 additions & 54 deletions src/ado_files/ieduplicates.ado

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/ado_files/iefieldkit.ado
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
*! version 1.0 31JAN2018 DIME Analytics [email protected]
*! version 1.1 20MAY2019 DIME Analytics [email protected]

capture program drop iefieldkit
program iefieldkit, rclass

* UPDATE THESE LOCALS FOR EACH NEW VERSION PUBLISHED
local version "1.0"
local versionDate "31JAN2019"
local version "1.1"
local versionDate "20MAY2019"

syntax [anything]

Expand Down
22 changes: 16 additions & 6 deletions src/ado_files/ietestform.ado
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*! version 1.0 31JAN2018 DIME Analytics [email protected]
*! version 1.1 20MAY2019 DIME Analytics [email protected]

capture program drop ietestform
program ietestform , rclass
Expand Down Expand Up @@ -490,7 +490,7 @@ qui {

*Variables that must be included every time
local name_vars "name"
local cmd_vars "type required" // Include as needed eventually. "readonly appearance"
local cmd_vars "type required appearance" // Include as needed eventually. "readonly"
local msg_vars "`labelvars'"
local code_vars "" // Include as needed eventually. " constraint relevance calculation repeat_count choice_filter"

Expand Down Expand Up @@ -619,8 +619,8 @@ qui {
inlist(type, "start", "end", "deviceid", "subscriberid", "simserial", "phonenumber", "username", "caseid") | /// Default meta types doen not need to be required
missing(type)) /// Rows that are not fields shold be skipped

*List and output non-note fields that are not required
gen nonnote_nonrequired = (req_relevant == 1 & type != "note" & lower(required) != "yes")
*List and output non-note, non-label fields that are not required
gen nonnote_nonrequired = (req_relevant == 1 & type != "note" & appearance != "label" & lower(required) != "yes")
count if nonnote_nonrequired == 1
if `r(N)' > 0 {

Expand All @@ -639,6 +639,16 @@ qui {
noi report_file add , report_tempfile("`report_tempfile'") wikifragment("Required_Column") message("`error_msg'") table("list row type name if note_required == 1") testname("REQUIRED NOTE TYPE FIELD")
}

*List and output required label fields in field-list groups
gen label_required = (req_relevant == 1 & appearance == "label" & lower(required) == "yes")
count if label_required == 1
if `r(N)' > 0 {

*Prepare message and write it
local error_msg "Fields with appearance [label] (inside a field-list group) must not be required. Label fields are currently required in the following rows:"
noi report_file add , report_tempfile("`report_tempfile'") wikifragment("Required_Column") message("`error_msg'") table("list row type name if label_required == 1") testname("REQUIRED LABEL FIELD")
}

}
end

Expand Down Expand Up @@ -740,7 +750,7 @@ qui {
local beginrow = subinstr("`beginrow'","#","", 1) //Remove the parse char "#"

*If the names are not the same, it is most likely a different group or repeat group that is incorrectly being closed
if "`endname'" != "`beginname'" {
if "`endname'" != "`beginname'" & !missing("`endname'") {

local error_msg "begin_`begintype' [`beginname'] on row `beginrow' and end_`endtype' [`endname'] on row `endrow'"

Expand Down Expand Up @@ -960,7 +970,7 @@ qui {

local error_msg "These variable names are longer then 32 characters. That is allowed in the data formats used in SurveyCTO - and is therefore allowed in their test - but will cause an error when the data is imported to Stata. The following names should be shortened:"

noi report_file add , report_tempfile("`report_tempfile'") testname("TOO LONG FIELD NAMES") message("`error_msg'") wikifragment("Field_Name_Length") able("list row type name if longname == 1")
noi report_file add , report_tempfile("`report_tempfile'") testname("TOO LONG FIELD NAMES") message("`error_msg'") wikifragment("Field_Name_Length") table("list row type name if longname == 1")

}

Expand Down
2 changes: 1 addition & 1 deletion src/help_files/iecodebook.sthlp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{smcl}
{* 31 Jan 2019}{...}
{* 20 May 2019}{...}
{hline}
help for {hi:iecodebook}
{hline}
Expand Down
2 changes: 1 addition & 1 deletion src/help_files/iecompdup.sthlp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{smcl}
{* 31 Jan 2019}{...}
{* 20 May 2019}{...}
{hline}
help for {hi:iecompdup}
{hline}
Expand Down
99 changes: 76 additions & 23 deletions src/help_files/ieduplicates.sthlp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{smcl}
{* 31 Jan 2019}{...}
{* 20 May 2019}{...}
{hline}
help for {hi:ieduplicates}
{hline}
Expand All @@ -20,6 +20,12 @@ command please see the {browse "https://dimewiki.worldbank.org/wiki/Ieduplicates
, {cmdab:fol:der(}{it:string}{cmd:)} {cmdab:unique:vars(}{it:varlist}{cmd:)}
[{cmdab:keep:vars(}{it:varlist}{cmd:)} {cmdab:tostringok} {cmdab:droprest}
{cmdab:nodaily} {cmdab:suf:fix(}{it:string}{cmd:)}
{cmdab:duplistid(}{it:string}{cmd:)} {cmdab:datelisted(}{it:string}{cmd:)}
{cmdab:datefixed(}{it:string}{cmd:)} {cmdab:correct(}{it:string}{cmd:)}
{cmdab:drop(}{it:string}{cmd:)} {cmdab:newid(}{it:string}{cmd:)}
{cmdab:initials(}{it:string}{cmd:)} {cmdab:notes(}{it:string}{cmd:)}]{p_end}



{phang2}where {it:ID_varname} is the variable that will be controlled for duplicates

Expand All @@ -32,8 +38,23 @@ command please see the {browse "https://dimewiki.worldbank.org/wiki/Ieduplicates
{synopt :{cmdab:keep:vars(}{it:varlist}{cmd:)}}variables to be included in the Excel report in addition to {it:ID_varname} and {cmdab:unique:vars()} {p_end}
{synopt :{cmdab:tostringok}}allows {it:ID_varname} to be recast to string if required{p_end}
{synopt :{cmdab:droprest}}disables the requirement that duplicates must be explicitly deleted{p_end}
{synopt :{cmdab:suf:fix(}{it:string}{cmd:)}}allows the user to add a suffix to the filename of the Excel report{p_end}
{synopt :{cmdab:nodaily}}disables daily back-up copies of the Excel report{p_end}
{synopt :{cmdab:suf:fix(}{it:string}{cmd:)}}allows the user to add a suffix to the filename of the Excel report{p_end}

{pstd}{it: {ul:{hi:Excel variable name options:}}}{p_end}

{pstd}These options allow users to customize the column names in the Excel report spreadsheet. They are intended for situations when the dataset already has variable(s) with the same name as one of the default column names in the Excel report. {p_end}

{synopt :{cmdab:duplistid(}{it:string}{cmd:)}}customizes variable {it:duplistid}{p_end}
{synopt :{cmdab:datelisted(}{it:string}{cmd:)}}customizes variable {it:datelisted}{p_end}
{synopt :{cmdab:datefixed(}{it:string}{cmd:)}}customizes variable {it:datefixed}{p_end}
{synopt :{cmdab:correct(}{it:string}{cmd:)}}customizes variable {it:correct}{p_end}
{synopt :{cmdab:drop(}{it:string}{cmd:)}}customizes variable {it:drop}{p_end}
{synopt :{cmdab:newid(}{it:string}{cmd:)}}customizes variable {it:newid}{p_end}
{synopt :{cmdab:initials(}{it:string}{cmd:)}}customizes variable {it:initials}{p_end}
{synopt :{cmdab:notes(}{it:string}{cmd:)}}customizes variable {it:notes}{p_end}


{synoptline}

{title:Description}
Expand All @@ -55,11 +76,11 @@ needs to be specified in order to have a unique reference for each row in the Ex
observations in the Excel report, either on its own or together with {it:ID_varname}. {cmd:ieduplicates}
then returns the data set without these duplicates.

{pstd}The Excel report includes three columns called {it:correct}, {it:drop} and {it:newID}.
{pstd}The Excel report includes three columns called {it:correct}, {it:drop} and {it:newid}.
Each of them represents one way to correct the duplicates. If {it:correct} is indicated with
a "Yes" then that observation is kept unchanged, if {it:drop} is indicated with a "Yes" then
that observation is deleted and if {it:newID} is indicated then that observation is assigned
a new ID using the value in column {it:newID}. After corrections are entered, the report should
that observation is deleted and if {it:newid} is indicated then that observation is assigned
a new ID using the value in column {it:newid}. After corrections are entered, the report should
be saved in the same location {cmdab:fol:der(}{it:string}{cmd:)} without any changes to its name.

{pstd}Before outputting a new report {cmd:ieduplicates} always checks if there already is an
Expand Down Expand Up @@ -99,15 +120,15 @@ drop and assign a new ID to. For data integrity reasons, be careful not to expor
Excel files including both identifying variables and names together with {it:ID_varname}.

{phang}{cmdab:tostringok} allows {it:ID_varname} to be turned into a string variable in case
{it:ID_varname} is numeric but a value listed in {it:newID} is non-numeric. Otherwise an error is generated.
{it:ID_varname} is numeric but a value listed in {it:newid} is non-numeric. Otherwise an error is generated.

{phang}{cmdab:droprest} disables the requirement that duplicates must be explicitly deleted.
The default is that if one of the duplicates in a group of duplicates has a
correction, then that correction is only valid if all other duplicates in that
group have a correction as well. For example, if there are four observations with
the same value for {it:ID_varname} and one is correct, one needs a new ID and
two are incorrect and should be deleted. Then the first one is indicated to be
kept in the {it:correct} column, the second one is given a new ID in {it:newID}
kept in the {it:correct} column, the second one is given a new ID in {it:newid}
and the other two observations must be indicated for deletion in {it:drop}
unless {cmdab:droprest} is specified. The first two corrections are not considered valid and
will cause an error if {cmdab:droprest} is not specified and the other
Expand All @@ -132,6 +153,16 @@ report in a sub-folder called Daily in the folder specified in {cmdab:folder()}.
the folder /Daily/ does not exist, then it is created unless the
option {cmdab:nodaily} is used.

{title:Excel variable name options:}

{phang}{cmdab:duplistid(}{it:string}{cmd:)} {cmdab:datelisted(}{it:string}{cmd:)}
{cmdab:datefixed(}{it:string}{cmd:)} {cmdab:correct(}{it:string}{cmd:)}
{cmdab:drop(}{it:string}{cmd:)} {cmdab:newid(}{it:string}{cmd:)}
{cmdab:initials(}{it:string}{cmd:)} {cmdab:notes(}{it:string}{cmd:)}
allow the user to set a unique name for each of the default variable names (e.g. {it:duplistid}, {it:datelisted}, etc.) in the Excel report spreadsheet.
These options are meant to be used when a default variable name already exists in the dataset. To avoid an error, the command offers a way to modify the variable name in the Excel report spreadsheet. {p_end}


{title:The Excel Report}

{pstd}A report of duplicates will be created in {cmdab:fol:der(}{it:string}{cmd:)}
Expand All @@ -142,40 +173,40 @@ that day, then that report will be overwritten.

{pstd}All duplicates in a group of duplicates must have a correction indicated. If
one or more duplicates are indicated as correct in {it:correct} or assigned a new
ID in {it:newID}, then all other duplicates with the same value in {it:ID_varname} must
ID in {it:newid}, then all other duplicates with the same value in {it:ID_varname} must
be explicitly indicated for deletion. This requirement may (but probably
shouldn't) be disabled by option {cmdab:droprest}.

{dlgtab:Columns in Excel Report filled in automatically:}

{phang}{it:dupListID} stores an auto incremented duplicate list ID that is used
{phang}{it:duplistid} stores an auto incremented duplicate list ID that is used
to maintain the sort order in the Excel Report regardless of how the data in memory
is sorted at the time {cmd:ieduplicates} is executed.

{phang}{it:dateListed} stores the date the duplicate was first identified.
{phang}{it:datelisted} stores the date the duplicate was first identified.

{phang}{it:dateFixed} stores the date a valid correction was imported the first
{phang}{it:datefixed} stores the date a valid correction was imported the first
time for that duplicate.

{dlgtab:Columns in Excel Report to be filled in manually by a user:}

{phang}{it:correct} is used to indicate that the duplicate should be kept. Valid values are
restricted to "yes" and "y" to reduce the risk of unintended entries. The values
are not sensitive to case. All valid values are changed to "yes" lower case when
imported. If {it:correct} is indicated then both {it:drop} and {it:newID} must be
imported. If {it:correct} is indicated then both {it:drop} and {it:newid} must be
left empty.

{phang}{it:drop} is used to indicate that the duplicate should be deleted. Valid values are
restricted to "yes" and "y" to reduce the risk of unintended entries. The values
are not sensitive to case. All valid values are changed to "yes" lower case when
imported. If {it:drop} is indicated then both {it:correct} and {it:newID} must be
imported. If {it:drop} is indicated then both {it:correct} and {it:newid} must be
left empty.

{phang}{it:newID} is used to assign a new ID values to a duplicate. If {it:ID_varname}
is a string then all values are valid for {it:newID}. If {it:ID_varname} is numeric then
{phang}{it:newid} is used to assign a new ID value to a duplicate. If {it:ID_varname}
is a string then all values are valid for {it:newid}. If {it:ID_varname} is numeric then
only digits are valid, unless the option {cmdab:tostringok} is specified.
If {cmdab:tostringok} is specified and {it:newID} is non-numeric, then {it:ID_varname}
is recasted to a string variable. If {it:newID} is indicated then both {it:correct} and {it:drop} must be
If {cmdab:tostringok} is specified and {it:newid} is non-numeric, then {it:ID_varname}
is recast to a string variable. If {it:newid} is indicated then both {it:correct} and {it:drop} must be
left empty.

{phang}{it:initials} allows the team working with this data to keep track of who
Expand Down Expand Up @@ -245,7 +276,7 @@ unresolved duplicates were found
{hi:Example 4.} Using the Excel file. The table below could be the report generated in Example 2 above. Make the viewer window wider and reload the page if the table below does not display properly!

{col 3}{c TLC}{hline 116}{c TRC}
{col 3}{c |}{col 4}HHID{col 10}dupListID{col 21}dateListed{col 33}dateFixed{col 44}correct{col 53}drop{col 59}newID{col 65}initials{col 75}note{col 94}KEY{col 107}enumerator{col 120}{c |}
{col 3}{c |}{col 4}HHID{col 10}duplistid{col 21}datelisted{col 33}datefixed{col 44}correct{col 53}drop{col 59}newid{col 65}initials{col 75}notes{col 94}KEY{col 107}enumerator{col 120}{c |}
{col 3}{c LT}{hline 116}{c RT}
{col 3}{c |}{col 4}4321{col 10}1{col 21}27Dec2015{col 33}02Jan2016{col 44}yes{col 53} {col 59} {col 65}KB{col 75}double submission{col 94}{it:uniquevalue}{col 107}{it:keepvarvalue}{col 120}{c |}
{col 3}{c |}{col 4}4321{col 10}2{col 21}27Dec2015{col 33}02Jan2016{col 44} {col 53}yes{col 59} {col 65}KB{col 75}double submission{col 94}{it:uniquevalue}{col 107}{it:keepvarvalue}{col 120}{c |}
Expand All @@ -260,19 +291,41 @@ unresolved duplicates were found
{pmore}The table above shows an example of an Excel report with 4 duplicate groups with
two duplicates in each group. The duplicates in 4321 and in 1145 have both been corrected
but 7365 and 9834 are still unresolved. Before any observation was corrected, all observations had
{it:dateFixed}, {it:correct}, {it:drop}, {it:newID}, {it:initials} and {it:note} empty just like the observations for ID 7365 and 9834. {it:dateFixed}
{it:datefixed}, {it:correct}, {it:drop}, {it:newid}, {it:initials} and {it:notes} empty just like the observations for ID 7365 and 9834. {it:datefixed}
is not updated by the user, the command adds this date the first time the correction is made.

{pmore}Observation with dupListID == 5 was found to have been
assigned the incorrect ID while the data was collected. This observation is assigned the correct ID in {it:newID}
and observation dupListID == 6 is indicated to be correct. Someone with initials IB made this
{pmore}Observation with duplistid == 5 was found to have been
assigned the incorrect ID while the data was collected. This observation is assigned the correct ID in {it:newid}
and observation duplistid == 6 is indicated to be correct. Someone with initials IB made this
correction and made a note. This note can and should be more descriptive but is kept short in this example.

{pmore}Observations with dupListID == 1 and dupListID == 2 were identified as a duplicate submissions of the same
{pmore}Observations with duplistid == 1 and duplistid == 2 were identified as duplicate submissions of the same
observation. One is kept and one is dropped; usually it does not matter which you keep and which you drop, but that should be confirmed.

{pmore}Both corrections described in the example would have been easily identified using this command's sister command {help iecompdup}.



{phang}
{hi:Example 5.} {inp:ieduplicates HHID, folder(C:\myImpactEvaluation\baseline\data) uniquevars(KEY) drop(out) notes(notes_enumerators)}

{col 3}{c TLC}{hline 103}{c TRC}
{col 3}{c |}{col 4}HHID{col 10}duplistid{col 21}datelisted{col 33}datefixed{col 44}correct{col 53}out{col 59}newid{col 65}initials{col 75}notes_enumerators{col 94}KEY{col 107}{c |}
{col 3}{c LT}{hline 103}{c RT}
{col 3}{c |}{col 4}4321{col 10}1{col 21}27Dec2015{col 33}02Jan2016{col 44}yes{col 53} {col 59} {col 65}KB{col 75}double submission{col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}4321{col 10}2{col 21}27Dec2015{col 33}02Jan2016{col 44} {col 53}yes{col 59} {col 65}KB{col 75}double submission{col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}7365{col 10}3{col 21}03Jan2016{col 33} {col 44} {col 53} {col 59} {col 65} {col 75} {col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}7365{col 10}4{col 21}03Jan2016{col 33} {col 44} {col 53} {col 59} {col 65} {col 75} {col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}1145{col 10}5{col 21}03Jan2016{col 33}11Jan2016{col 44} {col 53} {col 59}1245{col 65}IB{col 75}incorrect id {col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}1145{col 10}6{col 21}03Jan2016{col 33}11Jan2016{col 44}yes{col 53} {col 59} {col 65}IB{col 75}correct id {col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}9834{col 10}7{col 21}11Jan2016{col 33} {col 44} {col 53} {col 59} {col 65} {col 75} {col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c |}{col 4}9834{col 10}8{col 21}11Jan2016{col 33} {col 44} {col 53} {col 59} {col 65} {col 75} {col 94}{it:uniquevalue}{col 107}{c |}
{col 3}{c BLC}{hline 103}{c BRC}

{pmore} The variable names in the Excel report are now changed to the names specified by the user. If the user changed any of the variable names in the Excel report, then run exactly the same code when importing the Excel file back to apply the decisions:{p_end}
{pmore}{inp:ieduplicates HHID, folder(C:\myImpactEvaluation\baseline\data) uniquevars(KEY) drop(out) notes(notes_enumerators)}{p_end}


{title:Acknowledgements}

{phang}We would like to acknowledge the help in testing and proofreading we received in relation to this command and help file from (in alphabetic order):{p_end}
Expand Down
2 changes: 1 addition & 1 deletion src/help_files/iefieldkit.sthlp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{smcl}
{* 31 Jan 2019}{...}
{* 20 May 2019}{...}
{hline}
help for {hi:iefieldkit}
{hline}
Expand Down
2 changes: 1 addition & 1 deletion src/help_files/ietestform.sthlp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{smcl}
{* 31 Jan 2019}{...}
{* 20 May 2019}{...}
{hline}
help for {hi:ietestform}
{hline}
Expand Down
Loading

0 comments on commit 4aada18

Please sign in to comment.