Code for Social Housing module: Tenancy Spells

tori.vanloenhout
10 November 2024

Social Housing Tenancy Spells Code

/* Set Parameters */

/*PARAMETERS
SQLCMD only (Activate by clicking Query->SQLCMD Mode)
Already in master.sql; Uncomment when running individually 
*/

:setvar idicleanversion "{idicleanversion}" 
:setvar targetdb "{targetdb}"
:setvar targetschema "{targetschema}"
:setvar yyyymm "{yyyymm}"

/*
:setvar idicleanversion "IDI_Clean_202406" 
:setvar targetdb "IDI_Sandpit"
:setvar targetschema "DL-MAA2020-47"
:setvar yyyymm "202406"
*/

/* Assign the target database to which all the components need to be created in. */
USE IDI_UserCode;
GO

/* Create a single household identifier from legagcy, Housing New Zealand and MSD identifiers */
/* Get all HNZ household IDs */
drop table if exists #household_id1;
select distinct
snz_household_uid
,iif(snz_legacy_household_uid = 1, null, snz_legacy_household_uid) as snz_legacy_household_uid
into #household_id1
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where hnz_ts_snapshot_date <= '2015-08-31'
/* Add in add records captured in adhoc data only */
union 
select 
snz_household_uid
,iif(snz_legacy_household_uid = 1, null, snz_legacy_household_uid) as snz_legacy_household_uid
from [IDI_Adhoc].[clean_read_HNZ].[adhoc_clean_tenancy_snapshot]
where hnz_ts_snapshot_date <= '2010-10-31' and snz_legacy_household_uid is not null;

/* Match all IDS */
drop table if exists #household_id2;
with matched_leg_id as (
select 
* 
from #household_id1 
where snz_household_uid is not null and snz_legacy_household_uid is not null
)
select distinct
coalesce(a.snz_household_uid, c.snz_household_uid) as snz_household_uid
,coalesce(a.snz_legacy_household_uid, b.snz_legacy_household_uid) as snz_legacy_household_uid
into #household_id2
from #household_id1 a
left join matched_leg_id b on a.snz_household_uid = b.snz_household_uid 
left join matched_leg_id c on a.snz_legacy_household_uid = c.snz_legacy_household_uid;

/* Get households from MSD data */
drop table if exists #msd_first_snapshot;
select
snz_household_uid 
,snz_msd_application_uid
,snz_uid as msd_pt
,min(isnull(msd_tenancy_start_date,'3999-12-31')) as msd_entry_date
,min(isnull(hnz_ts_snapshot_date,'3999-12-31')) as msd_min_snapshot
,max(isnull(hnz_ts_snapshot_date,'1900-01-01')) as msd_max_snapshot
into #msd_first_snapshot
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] a
where hnz_ts_snapshot_date > '2015-08-31'
group by snz_household_uid,snz_msd_application_uid,snz_uid;

drop table if exists #msd_households;
select 
snz_household_uid
,snz_msd_application_uid
,count(*) as [count]
,min(isnull(msd_tenancy_start_date,'3999-12-31')) as min_start_date
,min(isnull(hnz_ts_snapshot_date,'3999-12-31')) as msd_min_snapshot 
,max(isnull(hnz_ts_snapshot_date,'1900-01-01')) as msd_max_snapshot
into #msd_households  
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where snz_msd_application_uid is not null and msd_provider_name_text != 'CHP'
group by snz_household_uid, snz_msd_application_uid;

drop table if exists #msd_households1;
select 
a.snz_household_uid
,a.snz_msd_application_uid
,a.[count]
,a.min_start_date
,min(b.msd_min_snapshot) as msd_min_snapshot
,max(b.msd_max_snapshot) as msd_max_snapshot
,sum(b.[count]) as count_total_msd
,count(b.snz_household_uid) as count_households
into #msd_households1
from #msd_households a
left join #msd_households b on a.snz_msd_application_uid = b.snz_msd_application_uid
group by a.snz_household_uid,a.snz_msd_application_uid,a.[count],a.min_start_date;

drop table if exists #msd_households1a;
select 
snz_household_uid
,snz_msd_application_uid
,min_start_date
,msd_min_snapshot
,msd_max_snapshot
,count_total_msd
,[count]
,row_number() over (partition by snz_msd_application_uid order by snz_msd_application_uid,[count] desc,snz_household_uid) as row_num
into #msd_households1a
from #msd_households1;

/* Use most common household_uid */
drop table if exists #msd_households2;
select 
snz_household_uid
,snz_msd_application_uid
,min_start_date
,msd_min_snapshot
,msd_max_snapshot
into #msd_households2
from #msd_households1a
where [count] > 0.5*count_total_msd or ([count] = 0.5*count_total_msd and row_num = 1);

/* Add on primary tenant */
drop table if exists #msd_households3;
select 
a.*
,b.snz_uid as prim_tenant
into #msd_households3
from #msd_households2 a
left join [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] b on a.snz_msd_application_uid = b.snz_msd_application_uid and a.msd_min_snapshot = b.hnz_ts_snapshot_date;

/* Link HNZ and MSD data */
/* Get last observed household from HNZ data */
drop table if exists #hnz_last_snapshot1;
select
snz_household_uid 
,snz_uid as hnz_pt 
,max(hnz_ts_snapshot_date) as hnz_last_snapshot
,max(hnz_ts_house_entry_date) as hnz_entry_date 
into #hnz_last_snapshot1
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where hnz_ts_snapshot_date <= '2015-08-31' and snz_household_uid is not null
group by snz_household_uid,snz_uid;

drop table if exists #hnz_last_snapshot2;	
select 
*
,row_number() over (partition by snz_household_uid order by hnz_last_snapshot desc, hnz_entry_date desc) as rnk
into #hnz_last_snapshot2
from #hnz_last_snapshot1;

drop table if exists #hnz_last_snapshot3;
select 
snz_household_uid
,hnz_pt
,hnz_last_snapshot
,hnz_entry_date
into #hnz_last_snapshot3
from #hnz_last_snapshot2
where rnk = 1;

/* Merge data */
drop table if exists #hnz_msd_match1;
select
a.snz_household_uid
,b.snz_msd_application_uid
,iif(a.hnz_pt = b.prim_tenant, 1, 0) as pt_match
,iif(a.hnz_entry_date = b.min_start_date, 1, 0) as entry_match
,b.min_start_date as msd_entry_date
,b.msd_min_snapshot
,b.msd_max_snapshot
into #hnz_msd_match1
from #hnz_last_snapshot3 a
inner join #msd_households3 b on a.snz_household_uid = b.snz_household_uid
where b.min_start_date <= '2015-08-31';

/* Rank to dedup */
drop table if exists #hnz_msd_match2;
select 
* 
,row_number() over (partition by snz_household_uid order by pt_match desc, entry_match desc, msd_entry_date, msd_max_snapshot desc) as rnk
into #hnz_msd_match2
from #hnz_msd_match1;

drop table if exists #hnz_msd_match3;
select 
snz_household_uid
,snz_msd_application_uid
into #hnz_msd_match3
from #hnz_msd_match2
where rnk = 1;

/* Join matched households onto hnz table */
drop table if exists #household_id3;
select 
a.*
,b.snz_msd_application_uid
,a.snz_household_uid as orig_snz_household_uid
into #household_id3
from #household_id2 a
left join #hnz_msd_match3 b on a.snz_household_uid = b.snz_household_uid;

/* Tidy new households in MSD data */
/* Get all MSD households not match to HNZ data */
drop table if exists #new_msd_hh;
select 
a.*
into #new_msd_hh
from (select distinct 
	  snz_household_uid
	  ,snz_msd_application_uid
	  from #msd_households3
     ) a
left join #household_id3 b on a.snz_msd_application_uid = b.snz_msd_application_uid 
where b.snz_household_uid is null;

/* Null snz_household_uid if it exists in HNZ data */
drop table if exists #new_msd_hh2;
select 
iif(b.snz_household_uid is null, a.snz_household_uid, null) as snz_household_uid
,a.snz_msd_application_uid
,a.snz_household_uid as orig_snz_household_uid
into #new_msd_hh2
from #new_msd_hh a
left join (select distinct 
	       snz_household_uid 
	       from #household_id3 
	       where snz_household_uid is not null
          ) b on a.snz_household_uid = b.snz_household_uid;

drop table if exists #dup_rank;
select
snz_household_uid
,snz_msd_application_uid
,row_number() over (partition by snz_household_uid order by entry_date, max_snapshot desc) as rnk
into #dup_rank
from (select 
	  a.snz_household_uid
	  ,snz_msd_application_uid
	  ,min(min_start_date) as entry_date
	  ,max(msd_max_snapshot) as max_snapshot
	  from #msd_households3 a
	  inner join (select 
		          snz_household_uid 
		          from #new_msd_hh2 
		          group by snz_household_uid
		          having count(*) > 1
	             ) b on a.snz_household_uid = b.snz_household_uid
	  group by a.snz_household_uid, snz_msd_application_uid
     ) subqry;

drop table if exists #new_msd_hh3;
select 
iif(b.snz_household_uid is not null and rnk > 1, null, a.snz_household_uid) as snz_household_uid
,NULL as snz_legacy_household_uid
,a.snz_msd_application_uid
,a.orig_snz_household_uid
into #new_msd_hh3
from #new_msd_hh2 a
left join #dup_rank b on a.snz_msd_application_uid = b.snz_msd_application_uid;

/* Add in CHP households */
drop table if exists #chps;
select 
NULL as snz_household_uid
,NULL as snz_legacy_household_uid
,snz_msd_application_uid
,NULL as orig_snz_household_uid
into #chps
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where msd_provider_name_text = 'CHP';

/* Append all household IDS */
drop table if exists #household_id4;
select * into #household_id4 from #household_id3
union
select * from #new_msd_hh3
union
select * from #chps;

/* Identify non-unique snz_msd_application_uid */
drop table if exists #msd_app_dups1;
select 
snz_msd_application_uid 
into #msd_app_dups1
from #household_id4 
group by snz_msd_application_uid 
having count(*) > 1;

drop table if exists #msd_app_dups2;
select 
snz_msd_application_uid 
into #msd_app_dups2
from #msd_app_dups1 
where snz_msd_application_uid is not null;

delete from #household_id4 
where snz_msd_application_uid in (select distinct snz_msd_application_uid from #msd_app_dups2) and orig_snz_household_uid is null;

drop table if exists #household_id5;
select 
row_number() over (order by snz_legacy_household_uid,snz_household_uid,snz_msd_application_uid) as household_id
,*
into #household_id5
from #household_id4;

/* Do some further work on deduping msd households - look at those who have more than 1 original snz_household_uid */
drop table if exists #dup_hh;
select 
orig_snz_household_uid
,count(*) as [count]
into #dup_hh
from #household_id5
group by orig_snz_household_uid
having count(*) > 1;

drop table if exists #dup_hh1;
select 
a.*
,b.[count]
into #dup_hh1
from #household_id5 a
left join #dup_hh b on b.orig_snz_household_uid = a.orig_snz_household_uid
where b.orig_snz_household_uid is not null;

drop table if exists #dup_hh2;
select 
a.household_id
,a.snz_household_uid
,a.snz_legacy_household_uid
,a.snz_msd_application_uid
,a.orig_snz_household_uid
,min(isnull(b.hnz_ts_snapshot_date,'3999-12-31')) as min_snap
,max(isnull(b.hnz_ts_snapshot_date,'1900-01-01')) as max_snap
,min(isnull(b.msd_tenancy_start_date,'3999-12-31')) as entry_date
into #dup_hh2
from #dup_hh1 a
left join [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] b on a.snz_msd_application_uid = b.snz_msd_application_uid
group by a.household_id,a.snz_household_uid,a.snz_legacy_household_uid,a.snz_msd_application_uid,a.orig_snz_household_uid;

drop table if exists #dup_hh3;
select 
a.*
,iif(a.snz_household_uid is not null,1,0) as prim
,iif(b.entry_date = '3999-12-31',null,b.entry_date) as prim_entry_date
,b.household_id as prim_household_id
into #dup_hh3
from #dup_hh2 a
left join #dup_hh2 b on a.orig_snz_household_uid = b.snz_household_uid and b.snz_household_uid is not null
order by orig_snz_household_uid;

/* Work out what is actually the same household and proceed */
drop table if exists #dup_hh4;
select 
prim_household_id as household_id
,orig_snz_household_uid as snz_household_uid
,snz_legacy_household_uid
,snz_msd_application_uid
,orig_snz_household_uid
into #dup_hh4
from #dup_hh3
where prim = 0 and entry_date = prim_entry_date;

/* Overwrite the ids where we've found better ones */
drop table if exists #household_id6;
select
coalesce(b.household_id,a.household_id) as household_id
,coalesce(b.snz_household_uid,a.snz_household_uid) as snz_household_uid
,a.snz_legacy_household_uid
,a.snz_msd_application_uid
,a.orig_snz_household_uid
into #household_id6
from #household_id5 a
left join #dup_hh4 b on a.snz_msd_application_uid = b.snz_msd_application_uid;

/* Create list of all household ids */
drop table if exists #tenancy_household_id_$(YYYYMM);
create table #tenancy_household_id_$(YYYYMM) (
household_id int not null,
link_set_key smallint not null,
snz_household_uid int null,
snz_legacy_household_uid int null,
snz_msd_application_uid int null,
orig_snz_household_uid int null
);

declare @link_set int
set @link_set = (select top 1 link_set_key from [$(idicleanversion)].[data].[personal_detail]);
insert into #tenancy_household_id_$(YYYYMM)

select 
household_id,
@link_set as link_set_key,
snz_household_uid,
snz_legacy_household_uid,
snz_msd_application_uid,
orig_snz_household_uid
from #household_id6;

/* HNZ houses - Find any unmatched houses in the house dataset */
drop table if exists #all_hnz_houses;
select distinct 
snz_house_uid
,snz_legacy_house_uid
into #all_hnz_houses
from [$(idicleanversion)].[hnz_clean].[houses_snapshot];

drop table if exists #all_hnz_houses1;
with matched_leg_id as (
select * 
from #all_hnz_houses 
where snz_house_uid is not null and snz_legacy_house_uid is not null
)
select distinct
coalesce(a.snz_house_uid, c.snz_house_uid) as snz_house_uid,
coalesce(a.snz_legacy_house_uid, b.snz_legacy_house_uid) as snz_legacy_house_uid
into #all_hnz_houses1
from #all_hnz_houses a
left join matched_leg_id b on a.snz_house_uid = b.snz_house_uid
left join matched_leg_id c on a.snz_legacy_house_uid = c.snz_legacy_house_uid

drop table if exists #hi_dups;
select distinct
snz_house_uid,
snz_legacy_house_uid,
count(*) as obs
into #hi_dups
from [$(idicleanversion)].[hnz_clean].[houses_snapshot]
where snz_legacy_house_uid in (select 
							   snz_legacy_house_uid
							   from #all_hnz_houses1 
							   where snz_legacy_house_uid is not null 
							   group by snz_legacy_house_uid 
							   having count(*) > 1
						       )
group by snz_house_uid,snz_legacy_house_uid;

/* Rank to dedup */
drop table if exists #hi_dups1;
select 
*, 
row_number() over (partition by snz_legacy_house_uid order by obs asc) as rnk
into #hi_dups1
from #hi_dups
where snz_house_uid is not null;

drop table if exists #hi_dups2;
select *
into #hi_dups2
from #hi_dups1
where rnk = 1;

/* NULL the conflicting values */
drop table if exists #all_hnz_houses2;
select 
a.snz_house_uid
,iif(b.snz_house_uid is not null,NULL,a.snz_legacy_house_uid) as snz_legacy_house_uid
into #all_hnz_houses2
from #all_hnz_houses1 a
left join #hi_dups2 b on a.snz_house_uid = b.snz_house_uid and a.snz_legacy_house_uid = b.snz_legacy_house_uid;

/* Tenanted Houses - Extract households in tenancy data with corresponding house IDs */
drop table if exists #tenancy_houses;
select
h1.household_id as household_id,
a.snz_household_uid,
iif(a.snz_legacy_household_uid = 1, null, a.snz_legacy_household_uid) as snz_legacy_household_uid,
convert(date, a.hnz_ts_snapshot_date) as snapshot_date,
a.snz_hnz_ts_house_uid,
a.snz_hnz_ts_legacy_house_uid,
a.snz_msd_house_uid
into #tenancy_houses
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] a
left join #tenancy_household_id_$(YYYYMM) h1 on a.snz_household_uid = h1.snz_household_uid 
where a.snz_household_uid is not null
union
select
l1.household_id as household_id,
b.snz_household_uid,
iif(b.snz_legacy_household_uid = 1, null, b.snz_legacy_household_uid) as snz_legacy_household_uid,
convert(date, b.hnz_ts_snapshot_date) as snapshot_date,
b.snz_hnz_ts_house_uid,
b.snz_hnz_ts_legacy_house_uid,
b.snz_msd_house_uid
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] b
left join #tenancy_household_id_$(YYYYMM) l1 on b.snz_legacy_household_uid = l1.snz_legacy_household_uid
where b.snz_household_uid is null and b.snz_legacy_household_uid is not null
union
select
m1.household_id as household_id,
c.snz_household_uid as snz_household_uid,
c.snz_legacy_household_uid,
convert(date, c.hnz_ts_snapshot_date) as snapshot_date,
c.snz_hnz_ts_house_uid,
c.snz_hnz_ts_legacy_house_uid,
iif(c.snz_msd_house_uid = 1, null, c.snz_msd_house_uid) as snz_msd_house_uid
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] c
left join #tenancy_household_id_$(YYYYMM) m1 on c.snz_msd_application_uid = m1.snz_msd_application_uid
where c.snz_msd_application_uid is not null
union 
select
l2.household_id as household_id,
e.snz_household_uid,
iif(e.snz_legacy_household_uid = 1, null, e.snz_legacy_household_uid) as snz_legacy_household_uid,
convert(date, e.hnz_ts_snapshot_date) as snapshot_date,
e.snz_hnz_ts_house_uid,
e.snz_hnz_ts_legacy_house_uid,
null as snz_msd_house_uid
from [IDI_Adhoc].[clean_read_HNZ].[adhoc_clean_tenancy_snapshot] e
left join #tenancy_household_id_$(YYYYMM) l2 on e.snz_legacy_household_uid = l2.snz_legacy_household_uid
where e.hnz_ts_snapshot_date <= '2010-10-31';

/* Match on HNZ house dataset */
drop table if exists #tenancy_houses2;
select 
a.*,
b.snz_house_uid, 
b.snz_legacy_house_uid
into #tenancy_houses2
from #tenancy_houses a
left join [$(idicleanversion)].[hnz_clean].[houses_snapshot] b on a.snz_household_uid = b.snz_household_uid and a.snapshot_date = b.hnz_hs_snapshot_date
where a.snz_household_uid is not null
union all
select 
c.*,
d.snz_house_uid, 
d.snz_legacy_house_uid
from #tenancy_houses c
left join [$(idicleanversion)].[hnz_clean].[houses_snapshot] d on c.snz_legacy_household_uid = d.snz_legacy_household_uid and c.snapshot_date = d.hnz_hs_snapshot_date
where c.snz_household_uid is null and c.snz_legacy_household_uid is not null;

/* Get all HNZ houses */
drop table if exists #matched_houses;
select distinct 
coalesce(snz_house_uid, snz_hnz_ts_house_uid) as snz_house_uid
,coalesce(snz_legacy_house_uid, snz_hnz_ts_legacy_house_uid) as snz_legacy_house_uid
into #matched_houses
from #tenancy_houses2;

/* Match up house ID and legacy house IDs */
drop table if exists #matched_houses2;
select distinct
coalesce(a.snz_house_uid, c.snz_house_uid) as snz_house_uid
,coalesce(a.snz_legacy_house_uid, b.snz_legacy_house_uid) as snz_legacy_house_uid
into #matched_houses2
from #matched_houses a
left join #all_hnz_houses2 b on a.snz_house_uid = b.snz_house_uid
left join #all_hnz_houses2 c on a.snz_legacy_house_uid = c.snz_legacy_house_uid;

/* Houses unmatched */
drop table if exists #unmatched_houses;
select 
a.*
into #unmatched_houses
from #all_hnz_houses2 a
left join #matched_houses2 b on coalesce(a.snz_house_uid, 0) = coalesce(b.snz_house_uid, 0) and coalesce(a.snz_legacy_house_uid, 0) = coalesce(b.snz_legacy_house_uid, 0)
where b.snz_house_uid is null and b.snz_legacy_house_uid is null;

/* MSD houses - add on MSD houses including household id */
drop table if exists #msd_house;
select distinct 
snz_msd_house_uid
,snz_household_uid
into #msd_house
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where snz_msd_house_uid is not null;

/* Create full list of all houses */
drop table if exists #all_houses;
select 
snz_house_uid
,snz_legacy_house_uid
,null as snz_msd_house_uid
,null as snz_household_uid
into #all_houses
from #matched_houses2 
union
select 
snz_house_uid
,snz_legacy_house_uid
,null as snz_msd_house_uid
,null as snz_household_uid
from #unmatched_houses
union
select distinct 
null as snz_house_uid
,null as snz_legacy_house_uid
,snz_msd_house_uid
,snz_household_uid
from #msd_house;

/* Create primary id */
drop table if exists #all_houses1;
select 
row_number() over (order by snz_msd_house_uid,snz_house_uid,snz_legacy_house_uid,snz_household_uid) as house_id
,a.*
into #all_houses1
from #all_houses a
where snz_msd_house_uid is not null or snz_house_uid is not null or snz_legacy_house_uid is not null;

/* Look at matching msd houses in more detail */
drop table if exists #msd_tenancy_houses;
select 
snz_msd_house_uid
,iif(msd_provider_name_text = 'CHP',NULL,snz_household_uid) as snz_household_uid
,snz_idi_address_register_uid
,min(hnz_ts_snapshot_date) as min_snap
,max(hnz_ts_snapshot_date) as max_snap
into #msd_tenancy_houses
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where snz_msd_house_uid is not null
group by snz_msd_house_uid,iif(msd_provider_name_text = 'CHP',NULL,snz_household_uid),snz_idi_address_register_uid;

/* There are plenty of households that change house id in the stats number but appears to be the same house. Want to give these two the same ID. There is probably a number of cases like this for the non-■■■■■■■■■■■■ */
drop table if exists #msd_tenancy_houses1;
select distinct 
snz_msd_house_uid
,a.snz_household_uid
,a.snz_idi_address_register_uid
,max_snap
,min_snap
,b.snz_house_uid
into #msd_tenancy_houses1
from #msd_tenancy_houses a
left join [$(idicleanversion)].[hnz_clean].[houses_snapshot] b on a.snz_household_uid = b.snz_household_uid and (a.snz_idi_address_register_uid is null or a.snz_idi_address_register_uid = b.snz_idi_address_register_uid or b.snz_idi_address_register_uid is null)
where a.snz_household_uid is not null;

drop table if exists #msd_tenancy_houses1_counts;
select 
snz_msd_house_uid
,snz_household_uid
,min_snap
,max_snap
,count(distinct(snz_house_uid)) as [count]
into #msd_tenancy_houses1_counts
from #msd_tenancy_houses1
where snz_house_uid is not null
group by snz_msd_house_uid,snz_household_uid,min_snap,max_snap;

drop table if exists #msd_tenancy_houses2;
select 
a.*
into #msd_tenancy_houses2
from #msd_tenancy_houses1 a
left join #msd_tenancy_houses1_counts b on a.snz_msd_house_uid = b.snz_msd_house_uid and a.snz_household_uid = b.snz_household_uid
where b.count = 1 and snz_house_uid is not null;

drop table if exists #msd_tenancy_houses2a;
select 
a.*
into #msd_tenancy_houses2a
from #msd_tenancy_houses1 a
left join #msd_tenancy_houses1_counts b on a.snz_msd_house_uid = b.snz_msd_house_uid and a.snz_household_uid = b.snz_household_uid
where b.count > 1 and snz_house_uid is not null;

/* Dedup snz_house_id where we know they are the same (ie same address/household in there) and determine what the new snz_house_uid should be for the duplicates */
drop table if exists #msd_tenancy_houses2a_1;
select 
a.*
,min(b.snz_house_uid) as new_house_uid
into #msd_tenancy_houses2a_1
from #msd_tenancy_houses2a a
left join #msd_tenancy_houses2a b on a.snz_msd_house_uid = b.snz_msd_house_uid and a.snz_household_uid = b.snz_household_uid
group by a.snz_msd_house_uid,a.snz_household_uid,a.snz_idi_address_register_uid,a.max_snap,a.min_snap,a.snz_house_uid;

/* Add them to main list */
drop table if exists #all_houses2;
select distinct 
coalesce(c.house_id,a.house_id) as house_id
,a.snz_house_uid
,a.snz_legacy_house_uid
,a.snz_msd_house_uid
,a.snz_household_uid
into #all_houses2
from #all_houses1 a
left join #msd_tenancy_houses2a_1 b on a.snz_house_uid = b.snz_house_uid
left join #all_houses1 c on b.new_house_uid = c.snz_house_uid and b.new_house_uid is not null;

/* Now to add the correct house_id for the msd id's that got deduped */
drop table if exists #all_houses3;
select distinct 
coalesce(c.house_id,a.house_id) as house_id
,a.snz_house_uid
,a.snz_legacy_house_uid
,a.snz_msd_house_uid
,a.snz_household_uid
into #all_houses3
from #all_houses2 a
left join #msd_tenancy_houses2a_1 b on a.snz_msd_house_uid = b.snz_msd_house_uid and b.snz_msd_house_uid is not null and a.snz_household_uid = b.snz_household_uid
left join #all_houses1 c on b.new_house_uid = c.snz_house_uid and b.new_house_uid is not null;

/* Now to combine id's for msd houses we are sure about (ie only one combination) - need to merge by msd and household to get id */
drop table if exists #all_houses4;
select distinct 
coalesce(c.house_id,a.house_id) as house_id
,a.snz_house_uid
,a.snz_legacy_house_uid
,a.snz_msd_house_uid,
a.snz_household_uid
into #all_houses4
from #all_houses3 a
left join #msd_tenancy_houses2 b on a.snz_msd_house_uid = b.snz_msd_house_uid and b.snz_msd_house_uid is not null and a.snz_household_uid = b.snz_household_uid
left join #all_houses1 c on b.snz_house_uid = c.snz_house_uid and b.snz_house_uid is not null;

/* Save table */
drop table if exists #tenancy_house_id_$(YYYYMM);
create table #tenancy_house_id_$(YYYYMM) (
house_id int not null,
link_set_key smallint not null,
snz_house_uid int null,
snz_legacy_house_uid int null,
snz_msd_house_uid int null,
snz_household_uid int null
); 

set @link_set = (select top 1 link_set_key from [$(idicleanversion)].[data].[personal_detail]);
insert into #tenancy_house_id_$(YYYYMM)

select 
house_id,
@link_set as link_set_key,
snz_house_uid,
snz_legacy_house_uid,
snz_msd_house_uid,
snz_household_uid
from #all_houses4;

/* Tidy up the tenancy table */
/* Add in missing data from adhoc tables */
drop table if exists #test;
select 
* 
into #test
from IDI_Adhoc.clean_read_HNZ.adhoc_clean_tenancy_snapshot
where hnz_ts_snapshot_date <= '2010-10-31';

/* Get the snz_msd_uid where they are not created by stats */
drop table if exists #nonstats;
select top 1 
snz_msd_uid
,count(*) as n 
into #nonstats
from #test 
group by snz_msd_uid 
order by n desc;

/* Find next and last time the household was in the main tenancy_snapshot data */
drop table if exists #temp;
select 
a.hnz_ts_snapshot_date,
a.snz_legacy_household_uid, 
a.snz_msd_uid, 
min(case when a.hnz_ts_snapshot_date <= b.hnz_ts_snapshot_date then b.hnz_ts_snapshot_date else '3999-12-31' end) as next_snapshot_date,
max(case when a.hnz_ts_snapshot_date >= b.hnz_ts_snapshot_date then b.hnz_ts_snapshot_date else '1900-01-01' end) as last_snapshot_date
into #temp
from #test a
left join [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] b on a.snz_legacy_household_uid = b.snz_legacy_household_uid
group by a.hnz_ts_snapshot_date,a.snz_legacy_household_uid, a.snz_msd_uid;

/* Bring through their msd_uid the last and next msd_uid in the main tenancy data. Also bring on primary from household data. Determine best msd_uid options. */
drop table if exists #temp2;
select 
a.hnz_ts_snapshot_date
,a.snz_legacy_household_uid
,a.snz_msd_uid as curr_msd
,b.snz_msd_uid as next_msd
,c.snz_msd_uid as last_msd
,g.snz_msd_uid as primary_msd
,i.snz_msd_uid as primary_msd_main
,case when a.snz_msd_uid = b.snz_msd_uid and b.snz_msd_uid = c.snz_msd_uid then 1 else 0 end as next_last_curr_same
,case when b.snz_msd_uid = c.snz_msd_uid then 1 else 0 end as next_last_same
,case when b.snz_msd_uid is null and c.snz_msd_uid is null then 1 else 0 end as no_entries
,case when d.snz_spine_uid is null then 0 else 1 end as spine_msd
,case when e.snz_spine_uid is null then 0 else 1 end as spine_next
,case when f.snz_spine_uid is null then 0 else 1 end as spine_last
,case when h.snz_spine_uid is null then 0 else 1 end as spine_primary
,case when d.snz_spine_uid is null and e.snz_spine_uid is null and f.snz_spine_uid is null then 0 else 1 end as spine_any
,case 
	when a.snz_msd_uid = b.snz_msd_uid and a.snz_msd_uid= c.snz_msd_uid then a.snz_msd_uid /* i.e. all the same */
	when d.snz_spine_uid is not null then a.snz_msd_uid /* i.e. current is on the spine */
	when b.snz_msd_uid = c.snz_msd_uid then b.snz_msd_uid /* i.e. next/last is the same */
	when g.snz_msd_uid = b.snz_msd_uid then b.snz_msd_uid /* i.e. last same as primary tenant */
	when g.snz_msd_uid = c.snz_msd_uid then c.snz_msd_uid /* i.e. next same as primary tenant */ 
	when h.snz_spine_uid is not null then g.snz_msd_uid /* i.e. primary tenant is on the spine */
	when e.snz_spine_uid is not null then b.snz_msd_uid /* i.e. next is on the spine */
	when f.snz_spine_uid is not null then c.snz_msd_uid  /* i.e. last is on the spine */
	when a.snz_msd_uid <> j.snz_msd_uid then a.snz_msd_uid /* i.e. when they are not created by stats */
	else coalesce(g.snz_msd_uid,b.snz_msd_uid,c.snz_msd_uid) /* i.e. take either primary/next/last if not null */
end as new_snz_msd
into #temp2
from #temp a
left join [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] b on a.snz_legacy_household_uid = b.snz_legacy_household_uid and a.next_snapshot_date = b.hnz_ts_snapshot_date
left join [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] c on a.snz_legacy_household_uid = c.snz_legacy_household_uid and a.last_snapshot_date = c.hnz_ts_snapshot_date
left join [$(idicleanversion)].[security].concordance d on a.snz_msd_uid = d.snz_msd_uid
left join [$(idicleanversion)].[security].concordance e on b.snz_msd_uid = e.snz_msd_uid
left join [$(idicleanversion)].[security].concordance f on c.snz_msd_uid = f.snz_msd_uid
left join [IDI_Adhoc].[clean_read_HNZ].[adhoc_clean_tenancy_h_snapshot] g on a.snz_legacy_household_uid = g.snz_legacy_household_uid and a.hnz_ts_snapshot_date = g.hnz_ths_snapshot_date and g.hnz_ths_app_relship_text = 'PRIMARY'
left join [$(idicleanversion)].[security].concordance h on g.snz_msd_uid = h.snz_msd_uid
left join [$(idicleanversion)].[hnz_clean].[tenancy_household_snapshot] i on a.snz_legacy_household_uid = i.snz_legacy_household_uid and a.hnz_ts_snapshot_date = i.hnz_ths_snapshot_date and i.hnz_ths_app_relship_text = 'PRIMARY'
left join #nonstats j on a.snz_msd_uid = j.snz_msd_uid;

/* There above query creates some duplicates as there are some houses have more than one primary tenants */
drop table if exists #temp2_distinct;
select distinct 
snz_legacy_household_uid
,hnz_ts_snapshot_date
,new_snz_msd 
into #temp2_distinct
from #temp2;

/* Prioritise matches */
drop table if exists #dup_priority;
select 
a.snz_legacy_household_uid,
a.hnz_ts_snapshot_date,
a.new_snz_msd,
case
	when spine_next = 1 and new_snz_msd = next_msd then 1
	when spine_last = 1 and new_snz_msd = last_msd then 2
	when spine_primary = 1 and new_snz_msd = primary_msd then 3
	when spine_next = 0 and new_snz_msd = next_msd then 4
	when spine_last = 0 and new_snz_msd = last_msd then 5
	when spine_primary = 0 and new_snz_msd = primary_msd then 6
	else 7 end as record_priority
into #dup_priority
from #temp2 a
inner join (select 
            snz_legacy_household_uid
			,hnz_ts_snapshot_date
			from #temp2_distinct
			group by snz_legacy_household_uid, hnz_ts_snapshot_date
			having count(*) > 1
			) b on a.snz_legacy_household_uid = b.snz_legacy_household_uid and a.hnz_ts_snapshot_date = b.hnz_ts_snapshot_date;

/* Select record with max priority */
drop table if exists #dup_max_priority;
select 
snz_legacy_household_uid
,hnz_ts_snapshot_date
,new_snz_msd
into #dup_max_priority
from (select 
      *, 
	  row_number() over (partition by snz_legacy_household_uid, hnz_ts_snapshot_date order by snz_legacy_household_uid, hnz_ts_snapshot_date, record_priority, new_snz_msd) as row_id
	  from #dup_priority
	  ) subqry
where row_id = 1;

drop table if exists #temp2_dedup;
select distinct 
a.snz_legacy_household_uid
,a.hnz_ts_snapshot_date
,coalesce(b.new_snz_msd, a.new_snz_msd) as new_snz_msd
into #temp2_dedup
from #temp2_distinct a
left join #dup_max_priority b on a.snz_legacy_household_uid = b.snz_legacy_household_uid and a.hnz_ts_snapshot_date = b.hnz_ts_snapshot_date;

/* Add the new snz_uid to the ad-hoc table */
drop table if exists #test_newid;
select 
hnz_ts_snapshot_date
,snz_household_uid
,snz_legacy_household_uid
,legacy_snz_msd_uid
,hnz_ts_hshd_size_nbr
,hnz_ts_house_entry_date
,hnz_ts_social_house_entry_date
,snz_hnz_ts_house_uid
,snz_hnz_ts_legacy_house_uid
,hnz_ts_hshd_weekly_income_nbr
,hnz_ts_inc_related_rent_nbr
,hnz_ts_inc_reltd_rent_subsdy_nbr
,hnz_ts_market_rent_nbr
,hnz_ts_hshd_type_text
into #test_newid
from #test;

drop table if exists #temp3;
select 
a.*,
b.new_snz_msd as snz_msd_uid,
c.snz_uid,
case when c.snz_spine_uid is null then 0 else 1 end as spine_ind
into #temp3
from #test_newid a
left join #temp2_dedup b on a.hnz_ts_snapshot_date = b.hnz_ts_snapshot_date and a.snz_legacy_household_uid = b.snz_legacy_household_uid
left join [$(idicleanversion)].[security].[concordance] c on b.new_snz_msd = c.snz_msd_uid;

/* Flag records in original data */
drop table if exists #temp4;
select 
a.*
,iif(b.snz_msd_uid is null, 1, 0) as adj_ind
into #temp4
from #temp3 a
left join [$(idicleanversion)].[hnz_clean].[tenancy_snapshot] b on a.snz_legacy_household_uid = b.snz_legacy_household_uid and a.hnz_ts_snapshot_date = b.hnz_ts_snapshot_date and a.snz_msd_uid = b.snz_msd_uid;

/* Append old and new data */
drop table if exists #temp5;
select 
snz_uid,
snz_msd_uid,
snz_household_uid,
snz_legacy_household_uid,
null as snz_msd_application_uid,
snz_hnz_ts_house_uid,
snz_hnz_ts_legacy_house_uid,
null as snz_msd_house_uid,
convert(date, hnz_ts_snapshot_date) as snapshot_date,
hnz_ts_house_entry_date as tenancy_start_date,
convert(date, hnz_ts_social_house_entry_date) as social_house_entry_date,
hnz_ts_hshd_size_nbr as hshd_size,
hnz_ts_hshd_type_text as hshd_type,
hnz_ts_hshd_weekly_income_nbr as hshd_weekly_income,
hnz_ts_inc_related_rent_nbr as inc_related_rent,
hnz_ts_inc_reltd_rent_subsdy_nbr as inc_reltd_rent_subsdy,
hnz_ts_market_rent_nbr as market_rent,
null as bedroom_required_count,
null as provider_name,
null as snz_idi_address_register_uid,
null as msd_ts_meshblock_code,
adj_ind
into #temp5
from #temp4
union all
select 
snz_uid,
snz_msd_uid,
snz_household_uid,
snz_legacy_household_uid,
snz_msd_application_uid,
snz_hnz_ts_house_uid,
snz_hnz_ts_legacy_house_uid,
snz_msd_house_uid,
convert(date, hnz_ts_snapshot_date) as snapshot_date,
coalesce(hnz_ts_house_entry_date, msd_tenancy_start_date) as tenancy_start_date,
convert(date, hnz_ts_social_house_entry_date) as social_house_entry_date,
hnz_ts_hshd_size_nbr as hshd_size,
hnz_ts_hshd_type_text as hshd_type,
hnz_ts_hshd_weekly_income_nbr as hshd_weekly_income,
hnz_ts_inc_related_rent_nbr as inc_related_rent,
hnz_ts_inc_reltd_rent_subsdy_nbr as inc_reltd_rent_subsdy,
hnz_ts_market_rent_nbr as market_rent,
msd_bedroom_required_count_nbr as bedroom_required_count,
msd_provider_name_text as provider_name,
snz_idi_address_register_uid,
msd_ts_meshblock_code,
0 as adj_ind
from [$(idicleanversion)].[hnz_clean].[tenancy_snapshot]
where hnz_ts_snapshot_date > '2010-10-31';

/* Match on new household ID and house ID variables */
drop table if exists #temp6_hnz;
select distinct
snz_uid,
snz_msd_uid,
h1.household_id,
h2.house_id,
snapshot_date,
tenancy_start_date,
social_house_entry_date,
hshd_size,
hshd_type,
hshd_weekly_income,
inc_related_rent,
inc_reltd_rent_subsdy,
market_rent,
bedroom_required_count,
provider_name,
snz_idi_address_register_uid,
msd_ts_meshblock_code,
adj_ind
into #temp6_hnz
from #temp5 a
left join #tenancy_household_id_$(YYYYMM) h1 on a.snz_household_uid = h1.snz_household_uid 
left join #tenancy_house_id_$(YYYYMM) h2 on a.snz_hnz_ts_house_uid = h2.snz_house_uid
where a.snz_household_uid is not null and a.snz_msd_application_uid is null
union all
select 
snz_uid,
snz_msd_uid,
l1.household_id,
l2.house_id,
snapshot_date,
tenancy_start_date,
social_house_entry_date,
hshd_size,
hshd_type,
hshd_weekly_income,
inc_related_rent,
inc_reltd_rent_subsdy,
market_rent,
bedroom_required_count,
provider_name,
snz_idi_address_register_uid,
msd_ts_meshblock_code,
adj_ind
from #temp5 b
left join #tenancy_household_id_$(YYYYMM) l1 on b.snz_legacy_household_uid = l1.snz_legacy_household_uid 
left join #tenancy_house_id_$(YYYYMM) l2 on b.snz_hnz_ts_legacy_house_uid = l2.snz_legacy_house_uid
where b.snz_legacy_household_uid is not null and b.snz_household_uid is null;

/* For MSD merge we need to also merge on household_id */
drop table if exists #temp6_msd1;
select distinct
snz_uid,
snz_msd_uid,
m1.household_id,
m2.house_id as house_id,
c.snz_msd_house_uid,
snapshot_date,
tenancy_start_date,
social_house_entry_date,
hshd_size,
hshd_type,
hshd_weekly_income,
inc_related_rent,
inc_reltd_rent_subsdy,
market_rent,
bedroom_required_count,
provider_name,
adj_ind,
snz_idi_address_register_uid,
msd_ts_meshblock_code
into #temp6_msd1
from #temp5 c
left join #tenancy_household_id_$(YYYYMM) m1 on c.snz_msd_application_uid = m1.snz_msd_application_uid
left join #tenancy_house_id_$(YYYYMM) m2 on c.snz_msd_house_uid = m2.snz_msd_house_uid and c.snz_household_uid = m2.snz_household_uid
where c.snz_msd_application_uid is not null;

/* Dedup the household by taking the first one that isn't null */
drop table if exists #temp6_msd2;
select 
snz_uid,
snz_msd_uid,
household_id,
house_id,
snapshot_date,
tenancy_start_date,
social_house_entry_date,
hshd_size,
hshd_type,
hshd_weekly_income,
inc_related_rent,
inc_reltd_rent_subsdy,
market_rent,
bedroom_required_count,
provider_name,
snz_idi_address_register_uid,
msd_ts_meshblock_code,
adj_ind
into #temp6_msd2
from (select 
      row_number() over (partition by a.household_id,a.snapshot_date order by a.household_id,a.snapshot_date,a.house_id desc,a.snz_uid) as row_num
	  ,a.*
      from #temp6_msd1 a
     ) b
where row_num = 1;

drop table if exists #temp6;
select * into #temp6 from #temp6_hnz
union all
select * from #temp6_msd2;

/* Backfill records where snapshot dates are missing */
drop table if exists #hh_missing_snapshots;
select 
household_id, 
min(snapshot_date) as min_snapshot_date,
max(snapshot_date) as max_snapshot_date
into #hh_missing_snapshots
from #temp6
group by household_id
having datediff(month, min(snapshot_date), max(snapshot_date)) + 1 != count(*);

/* Get full list of snapshot dates */
drop table if exists #snapshot_dates;
select distinct 
snapshot_date 
into #snapshot_dates 
from #temp6;

drop table if exists #hh_missing_snapshots_all;
select 
a.household_id,
b.snapshot_date
into #hh_missing_snapshots_all
from #hh_missing_snapshots a
cross join #snapshot_dates b
where b.snapshot_date between a.min_snapshot_date and a.max_snapshot_date;

drop table if exists #missing_snapshots;
select 
a.*
into #missing_snapshots
from #hh_missing_snapshots_all a
left join #temp6 b on a.household_id = b.household_id and a.snapshot_date = b.snapshot_date
where b.snapshot_date is null;

/* Get next snapshot date */
drop table if exists #missing_snapshots2;
select 
a.*, 
min(b.snapshot_date) as next_snapshot
into #missing_snapshots2
from #missing_snapshots a
inner join #temp6 b on a.household_id = b.household_id
where a.snapshot_date < b.snapshot_date 
group by a.household_id, a.snapshot_date;

drop table if exists #missing_snapshots3;
select 
b.snz_uid,
b.snz_msd_uid,
b.household_id,
b.house_id,
a.snapshot_date,
b.tenancy_start_date,
b.social_house_entry_date,
b.hshd_size,
b.hshd_type,
b.hshd_weekly_income,
b.inc_related_rent,
b.inc_reltd_rent_subsdy,
b.market_rent,
b.bedroom_required_count,
b.provider_name,
b.snz_idi_address_register_uid,
b.msd_ts_meshblock_code,
1 as adj_ind
into #missing_snapshots3
from #missing_snapshots2 a
inner join #temp6 b on a.household_id = b.household_id and a.next_snapshot = b.snapshot_date;

/* Append to snz_uid */
drop table if exists #temp7;
select * into #temp7 from #temp6
union all
select * from #missing_snapshots3;

/* Merge on House exit date */
drop table if exists #tenancy_exits1;
select
coalesce(m1.household_id, h1.household_id, l1.household_id) as household_id,
a.snz_household_uid,
a.snz_legacy_household_uid,
a.snz_msd_application_uid,
convert(date, a.hnz_te_exit_date) as exit_date,
iif(a.hnz_te_exit_status_text = 'TRANSFER', 'transfer', 'exit SH') as exit_type
into #tenancy_exits1
from [$(idicleanversion)].[hnz_clean].[tenancy_exit] a
left join #tenancy_household_id_$(YYYYMM) h1 on a.snz_household_uid = h1.snz_household_uid and h1.snz_household_uid is not null
left join #tenancy_household_id_$(YYYYMM) l1 on a.snz_legacy_household_uid = l1.snz_legacy_household_uid and l1.snz_legacy_household_uid is not null
left join #tenancy_household_id_$(YYYYMM) m1 on a.snz_msd_application_uid = m1.snz_msd_application_uid and m1.snz_msd_application_uid is not null
where hnz_te_exit_date > '2001-01-31';

/* Rank by latest exit */
drop table if exists #tenancy_exits2;
select 
*
,row_number() over (partition by household_id order by exit_date desc, exit_type) as row_id
into #tenancy_exits2
from #tenancy_exits1;

/* De-duplicate */
drop table if exists #tenancy_exits3;
select 
* 
into #tenancy_exits3 
from #tenancy_exits2 
where row_id = 1;

drop table if exists #temp8;
select 
a.*
,b.exit_date
,b.exit_type
into #temp8
from #temp7 a
left join #tenancy_exits3 b on a.household_id = b.household_id;

/* Social house entry date */
drop table if exists #sh_entry_prep1;
select 
snz_uid, 
isnull(social_house_entry_date,'3999-12-31') as social_house_entry_date
,isnull(tenancy_start_date,'3999-12-31') as tenancy_start_date
,isnull(snapshot_date,'3999-12-31') as snapshot_date
into #sh_entry_prep1
from #temp8;

drop table if exists #sh_entry_prep2;
select 
snz_uid, 
min(social_house_entry_date) as social_house_entry_date
,min(tenancy_start_date) as tenancy_start_date
,min(snapshot_date) as snapshot_date
into #sh_entry_prep2
from #sh_entry_prep1
group by snz_uid;

drop table if exists #sh_entry_date;
select 
snz_uid, 
case when social_house_entry_date <> '3999-12-31' then social_house_entry_date
     when tenancy_start_date <> '3999-12-31' then tenancy_start_date
	 when snapshot_date <> '3999-12-31' then snapshot_date else null end as social_house_entry_date
into #sh_entry_date
from #sh_entry_prep2;

drop table if exists #temp9;
select 
a.snz_uid,	
a.snz_msd_uid, 
a.household_id,
a.house_id,
a.snapshot_date,
a.tenancy_start_date,
b.social_house_entry_date,
a.hshd_size,
a.hshd_type,
a.hshd_weekly_income,
a.inc_related_rent,
a.inc_reltd_rent_subsdy,
a.market_rent,
a.bedroom_required_count,
a.provider_name,
a.snz_idi_address_register_uid,
a.msd_ts_meshblock_code as meshblock_code,
a.adj_ind,
a.exit_date,
a.exit_type
into #temp9
from #temp8 a
inner join #sh_entry_date b on a.snz_uid = b.snz_uid;

/* Tidy house ids where tenancies by looking forward a month */
drop table if exists #temp10;
select 
a.snz_uid,	
a.snz_msd_uid, 
a.household_id,
coalesce(b.house_id,a.house_id) as house_id,
a.snapshot_date,
a.tenancy_start_date,
a.social_house_entry_date,
a.hshd_size,
a.hshd_type,
a.hshd_weekly_income,
a.inc_related_rent,
a.inc_reltd_rent_subsdy,
a.market_rent,
a.bedroom_required_count,
a.provider_name,
a.snz_idi_address_register_uid,
a.meshblock_code,
a.adj_ind,
a.exit_date,
a.exit_type
into #temp10
from #temp9 a
left join #temp9 b on a.household_id = b.household_id and b.snapshot_date = eomonth(a.snapshot_date,1) and eomonth(a.tenancy_start_date,0) = a.snapshot_date;

/* Look at duplicates again */
drop table if exists #house_dups;
select 
house_id
,snapshot_date
,count(*) as n
into #house_dups
from #temp10
group by house_id,snapshot_date
having count(*) > 1
order by count(*) desc;

drop table if exists #house_dups1;
select 
a.*
,b.n
into #house_dups1
from #temp10 a
left join #house_dups b on a.house_id = b.house_id and a.snapshot_date = b.snapshot_date
where b.house_id is not null;

drop table if exists #house_dups2;
select 
a.* 
into #house_dups2
from #house_dups1 a
left join (select 
           house_id
		   ,snapshot_date 
		   from #house_dups1 
		   where eomonth(tenancy_start_date,0) = snapshot_date
		   ) b on a.house_id = b.house_id and a.snapshot_date = b.snapshot_date
where b.house_id is null;

/* Dedup anyone that is not on spine where there is someone with same dob and on spine previously */
drop table if exists #change_primary;
select 
b.snz_uid as prev_prim
,a.*
into #change_primary
from #temp10 a
left join #temp10 b on a.household_id = b.household_id and a.snapshot_date = eomonth(b.snapshot_date,1)
where b.snz_uid != a.snz_uid;

drop table if exists #change_primary1;
select 
a.prev_prim
,a.snz_uid
,a.household_id
,b.snz_sex_gender_code as sex
,c.snz_sex_gender_code as prev_sex
,datefromparts(b.snz_birth_year_nbr, b.snz_birth_month_nbr, 15) as dob
,datefromparts(c.snz_birth_year_nbr, c.snz_birth_month_nbr, 15) as prev_dob
,b.snz_spine_ind as spine
,c.snz_spine_ind as prev_spine
into #change_primary1
from #change_primary a
left join [$(idicleanversion)].[data].[personal_detail] b on a.snz_uid = b.snz_uid
left join [$(idicleanversion)].[data].[personal_detail] c on a.prev_prim = c.snz_uid;

drop table if exists #change_primary2;
select 
*
into #change_primary2
from #change_primary1
where convert(int,spine) + convert(int,prev_spine) = 1 and dob = prev_dob and sex = prev_sex;

drop table if exists #change_primary3;
select 
iif(spine = 1,prev_prim,snz_uid) as old_id
,iif(spine = 1,snz_uid,prev_prim) as new_id
into #change_primary3
from #change_primary2;

drop table if exists #temp11;
select distinct
coalesce(b.new_id,a.snz_uid) as snz_uid,	
a.snz_msd_uid, 
a.household_id,
a.house_id,
a.snapshot_date,
a.tenancy_start_date,
a.social_house_entry_date,
a.hshd_size,
a.hshd_type,
a.hshd_weekly_income,
a.inc_related_rent,
a.inc_reltd_rent_subsdy,
a.market_rent,
a.bedroom_required_count,
a.provider_name,
a.snz_idi_address_register_uid,
a.meshblock_code,
a.adj_ind,
a.exit_date,
a.exit_type
into #temp11
from #temp10 a
left join #change_primary3 b on a.snz_uid = b.old_id;

/* There are some obvious cases where across the transition from HNZ to MSD they have moved to a new house but kept same hh. Similarly the CHP house id keeps moving around a bit so tidy this up, some houses switch to a wrong id for a few months then switch back - need to adjust for these */
drop table if exists #rematch_houses;
select 
b.house_id as prev_house_id
,c.house_id as hnz_house_id
,a.house_id
,a.household_id
,a.snapshot_date
into #rematch_houses
from #temp11 a
left join #temp11 b on a.household_id = b.household_id and eomonth(b.snapshot_date,1)= a.snapshot_date 
left join #temp11 c on a.household_id = c.household_id and c.snapshot_date = '2015-08-31'
where a.snapshot_date >= '2015-09-30' and b.house_id != a.house_id
order by household_id,snapshot_date;

/* Count number of switches */
drop table if exists #rematch_houses_n;
select 
a.household_id
,count(*) as n
,min(snapshot_date) as first_switch
into #rematch_houses_n
from #rematch_houses a
group by household_id
having count(*) > 1;

/* Look at those with more than 1 switch */
drop table if exists #rematch_houses1;
select 
a.*
,b.n
,b.first_switch
into #rematch_houses1
from #rematch_houses a
left join #rematch_houses_n b on a.household_id = b.household_id;

/* Join on first house_id when first switched */
drop table if exists #rematch_houses2;
select 
a.*
,b.prev_house_id as first_switch_house_id  
into #rematch_houses2
from #rematch_houses1 a
left join #rematch_houses1 b on a.first_switch = b.snapshot_date and a.household_id = b.household_id and a.first_switch is not null;

/* Determine final ids */
drop table if exists #rematch_houses3;
select distinct 
house_id
,household_id
,coalesce(first_switch_house_id,prev_house_id) as new_house_id
into #rematch_houses3
from #rematch_houses2
where house_id != coalesce(first_switch_house_id,prev_house_id);

/* Merge onto table overall */
drop table if exists #temp12;
 select distinct
a.snz_uid,	
a.snz_msd_uid, 
a.household_id,
coalesce(b.new_house_id,a.house_id) as house_id,
a.snapshot_date,
a.tenancy_start_date,
a.social_house_entry_date,
a.hshd_size,
a.hshd_type,
a.hshd_weekly_income,
a.inc_related_rent,
a.inc_reltd_rent_subsdy,
a.market_rent,
a.bedroom_required_count,
a.provider_name,
a.snz_idi_address_register_uid,
a.meshblock_code,
a.adj_ind,
a.exit_date,
a.exit_type
into #temp12
from #temp11 a
left join #rematch_houses3 b on a.household_id = b.household_id and a.house_id = b.house_id;

/* Duplicate snz_uid mean that we can not create a primary key */
drop table if exists #tenancy_snapshot_clean_$(YYYYMM);
create table #tenancy_snapshot_clean_$(YYYYMM) (
snz_uid int null,
household_id int not null,
house_id int null,
link_set_key smallint not null,
snapshot_date date not null,
tenancy_start_date date null,
social_house_entry_date date null,
hshd_size smallint null,
hshd_type varchar(100) null,
hshd_weekly_income float null,
inc_related_rent float null,
inc_reltd_rent_subsdy float null,
market_rent float null,
bedroom_required_count int null,
provider_name varchar(5) null,
snz_idi_address_register_uid int null,
meshblock_code varchar(7) null,
adj_ind tinyint not null,
exit_date date null,
exit_type varchar(8) null,
constraint PK_ts_clean primary key clustered (link_set_key asc, household_id asc, snapshot_date asc)
);

set @link_set = (select top 1 link_set_key from [$(idicleanversion)].[data].[personal_detail])
insert into #tenancy_snapshot_clean_$(YYYYMM)

select 
snz_uid,	
household_id,
house_id,
@link_set as link_set_key,
snapshot_date,
tenancy_start_date,
social_house_entry_date,
hshd_size,
hshd_type,
hshd_weekly_income,
inc_related_rent,
inc_reltd_rent_subsdy,
market_rent,
bedroom_required_count,
provider_name,
snz_idi_address_register_uid,
meshblock_code,
adj_ind,
exit_date,
exit_type
from #temp12;

/* Tidy up tenancy household tables - Add missing records from pre-2010, assign all households a single household id, add back children dropped from latest refresh and match records across HNZ to MSD data migratation and assign a single snz_uid */
/* Link records from sandpit pre-2010 */
drop table if exists #pi_test;
SELECT 
*
into #pi_test
FROM [IDI_Adhoc].[clean_read_HNZ].[adhoc_clean_tenancy_h_snapshot]
where hnz_ths_snapshot_date <= '2010-10-31';

/* Using old concordance table merge using census/ird indicators the most recent msd number */
drop table if exists #better_linking;
select 
c.snz_cen_uid as cen_uid
,d.snz_ird_uid as ird_uid
,c.snz_msd_uid as cen_msd_uid
,d.snz_msd_uid as ird_msd_uid
,c.snz_uid as cen_snz_uid
,d.snz_uid as ird_snz_uid
,e.snz_uid as msd_snz_uid
,case when c.snz_spine_uid is not null then 1 else 0 end as cen_spine_ind
,case when d.snz_spine_uid is not null then 1 else 0 end as ird_spine_ind
,case when e.snz_spine_uid is not null then 1 else 0 end as msd_spine_ind
,coalesce(c.link_set_key,d.link_set_key,e.link_set_key) as link_set_key
,a.*
into #better_linking
from #pi_test a
inner join [$(idicleanversion)].[security].[concordance] b on a.snz_msd_uid = b.[snz_msd_uid]
left join [$(idicleanversion)].[security].[concordance] c on b.snz_cen_uid = c.snz_cen_uid and c.snz_cen_uid is not null  and b.snz_cen_uid is not null
left join [$(idicleanversion)].[security].[concordance] d on b.snz_ird_uid = d.snz_ird_uid and d.snz_ird_uid is not null  and b.snz_ird_uid is not null
left join [$(idicleanversion)].[security].[concordance] e on a.snz_msd_uid = e.snz_msd_uid and e.snz_msd_uid is not null;

/* Pick the right snz/msd uid's to use based on latest data */
drop table if exists #household_rematched;
select 
case when msd_spine_ind = 1 then msd_snz_uid
	when ird_spine_ind = 1 then ird_snz_uid
	when cen_spine_ind = 1 then cen_snz_uid
	when msd_snz_uid is not null then msd_snz_uid
	when ird_snz_uid is not null then ird_snz_uid
	when cen_snz_uid is not null then cen_snz_uid
	end as snz_uid
,link_set_key
,case 
	when msd_spine_ind = 1 then snz_msd_uid
	when ird_spine_ind = 1 then ird_msd_uid
	when cen_spine_ind = 1 then cen_msd_uid
	when msd_snz_uid is not null then snz_msd_uid
	when ird_snz_uid is not null then ird_msd_uid
	when cen_snz_uid is not null then cen_msd_uid
	end as snz_msd_uid
,snz_msd_uid as orig_msd_uid
,hnz_ths_snapshot_date
,snz_household_uid
,snz_legacy_household_uid
,hnz_ths_app_relship_text
,hnz_ths_birth_month_nbr
,hnz_ths_birth_year_nbr
,hnz_ths_sex_snz_code
,hnz_ths_ethnicity_text
,hnz_ths_signatory_flg_ind
,hnz_ths_ethnicity1_code
into #household_rematched
from #better_linking;

/* Pull out missing ids and see if we can match them better */
drop table if exists #match_again;
select 
*
into #match_again
from  #household_rematched
where snz_msd_uid is null

/* Match on using moe/dia variables */
drop table if exists #match_again1;
select 
c.snz_moe_uid as moe_uid
,c.snz_msd_uid as moe_msd_uid
,c.snz_uid as moe_snz_uid
,d.snz_dia_uid as dia_uid
,d.snz_msd_uid as dia_msd_uid
,d.snz_uid as dia_snz_uid
,case when c.snz_spine_uid is not null then 1 else 0 end as moe_spine_ind
,case when d.snz_spine_uid is not null then 1 else 0 end as dia_spine_ind
,a.*
into #match_again1
from #match_again a
inner join [$(idicleanversion)].[security].[concordance] b on a.orig_msd_uid = b.snz_msd_uid
left join [$(idicleanversion)].[security].[concordance] c on b.snz_moe_uid = c.snz_moe_uid and c.snz_moe_uid is not null
left join [$(idicleanversion)].[security].[concordance] d on b.snz_dia_uid = d.snz_dia_uid and d.snz_dia_uid is not null;

/* Create another matched dataset with moe/dia used */
drop table if exists #household_rematched1;
select 
case 
	when moe_spine_ind = 1 then moe_snz_uid
	when dia_spine_ind = 1 then dia_snz_uid
	when moe_snz_uid is not null then moe_snz_uid
	when dia_snz_uid is not null then dia_snz_uid
	end as snz_uid
,link_set_key
,case 
	when moe_spine_ind = 1 then moe_msd_uid
	when dia_spine_ind = 1 then dia_msd_uid
	when moe_msd_uid is not null then moe_msd_uid
	when dia_msd_uid is not null then dia_msd_uid
	end as snz_msd_uid
,orig_msd_uid
,hnz_ths_snapshot_date
,snz_household_uid
,snz_legacy_household_uid
,hnz_ths_app_relship_text
,hnz_ths_birth_month_nbr
,hnz_ths_birth_year_nbr
,hnz_ths_sex_snz_code
,hnz_ths_ethnicity_text
,hnz_ths_signatory_flg_ind
,hnz_ths_ethnicity1_code
into #household_rematched1
from #match_again1;

/* Go another round of matching by matching main household dataset using dob and sex */
drop table if exists #match_again_again;
select 
*
,row_number() over (partition by [hnz_ths_snapshot_date],[snz_legacy_household_uid],[hnz_ths_birth_month_nbr],hnz_ths_birth_year_nbr,hnz_ths_sex_snz_code
			        order by [hnz_ths_snapshot_date],[snz_legacy_household_uid],[hnz_ths_birth_month_nbr],hnz_ths_birth_year_nbr,hnz_ths_sex_snz_code) as dob_num
into #match_again_again
from  #household_rematched1
where snz_msd_uid is null;

drop table if exists #match_again_again1;
select distinct 
b.snz_msd_uid as hh_msd_uid
,b.snz_uid as hh_snz_uid
,a.*
into #match_again_again1
from #match_again_again a
left join (select 
           x.*
		   ,row_number() over (partition by [snz_legacy_household_uid],[hnz_ths_birth_month_nbr],hnz_ths_birth_year_nbr,[hnz_ths_gender_snz_code]
			                   order by [snz_legacy_household_uid],[hnz_ths_birth_month_nbr],hnz_ths_birth_year_nbr,[hnz_ths_gender_snz_code]) as dob_num
		   from (select distinct 
				  snz_uid
				  ,snz_msd_uid
				  ,snz_legacy_household_uid
				  ,hnz_ths_birth_month_nbr
				  ,hnz_ths_birth_year_nbr
				  ,[hnz_ths_gender_snz_code] 
				  from [$(idicleanversion)].[hnz_clean].[tenancy_household_snapshot]
				  ) x
		   ) b
on a.snz_legacy_household_uid=b.snz_legacy_household_uid 
and a.hnz_ths_birth_month_nbr=b.hnz_ths_birth_month_nbr
and a.hnz_ths_birth_year_nbr=b.hnz_ths_birth_year_nbr 
and a.hnz_ths_sex_snz_code = b.[hnz_ths_gender_snz_code]
and b.snz_legacy_household_uid is not null 
and b.dob_num = a.dob_num;

/* Format for appending - where we couldn't find a msd_uid take a negative of the original one */
drop table if exists #household_rematched2;
select  
case when hh_snz_uid is not null then hh_snz_uid else -orig_msd_uid end as snz_uid
,link_set_key
,case when hh_msd_uid is not null then hh_msd_uid else -orig_msd_uid end as snz_msd_uid
,orig_msd_uid
,hnz_ths_snapshot_date
,snz_household_uid
,snz_legacy_household_uid
,hnz_ths_app_relship_text
,hnz_ths_birth_month_nbr
,hnz_ths_birth_year_nbr
,hnz_ths_sex_snz_code
,hnz_ths_ethnicity_text
,hnz_ths_signatory_flg_ind
,hnz_ths_ethnicity1_code
into #household_rematched2
from #match_again_again1;

/* Combine all the matches into one dataset */
drop table if exists #tenancy_household_old_rematch;
select * into #tenancy_household_old_rematch from (select * from #household_rematched where snz_msd_uid is not null) a
union all 
select * from (select * from #household_rematched1 where snz_msd_uid is not null) b
union all 
select * from (select * from #household_rematched2 where snz_msd_uid is not null) c;

/* HNZ data - Read in HNZ household data in the IDI */
drop table if exists #hnz_1;
select 
snz_uid,
snz_msd_uid,
household_id,
convert(date, hnz_ths_snapshot_date) as snapshot_date,
hnz_ths_app_relship_text as app_relship,
hnz_ths_signatory_flg_ind as signatory_flag_ind,
case hnz_ths_gender_snz_code when 1 then 'M' when 2 then 'F' else 'X' end as sex,
datefromparts(hnz_ths_birth_year_nbr, hnz_ths_birth_month_nbr, 15) as dob
into #hnz_1
from [$(idicleanversion)].[hnz_clean].[tenancy_household_snapshot] a
left join #tenancy_household_id_$(YYYYMM) b on a.snz_household_uid = b.snz_household_uid and a.snz_msd_application_uid is null
where b.snz_household_uid is not null
union all
select 
snz_uid,
snz_msd_uid,
household_id,
convert(date, hnz_ths_snapshot_date) as snapshot_date,
hnz_ths_app_relship_text as app_relship,
hnz_ths_signatory_flg_ind as signatory_flag_ind,
case hnz_ths_gender_snz_code when 1 then 'M' when 2 then 'F' else 'X' end as sex,
datefromparts(hnz_ths_birth_year_nbr, hnz_ths_birth_month_nbr, 15) as dob
from [$(idicleanversion)].[hnz_clean].[tenancy_household_snapshot] c
left join #tenancy_household_id_$(YYYYMM) d on c.snz_legacy_household_uid = d.snz_legacy_household_uid 
where c.snz_legacy_household_uid is not null and c.snz_household_uid is null;

/* Get households that need filling in */
drop table if exists #hh_miss1;
select 
household_id, 
max(case when snapshot_date < '2010-11-30' then snapshot_date else '1900-01-01' end) as fill_from,
max(case when snapshot_date = '2010-11-30' then 1 else 0 end) as present_end
into #hh_miss1
from #hnz_1
where snapshot_date <= '2010-11-30' and household_id is not null
group by household_id;

/* Get list of snapshot dates to fill in */
drop table if exists #hh_miss2;
select 
a.household_id, 
a.snapshot_date, 
iif(b.present_end = 1, datefromparts(2010, 11, 30), b.fill_from) as imp_date
into #hh_miss2
from #tenancy_snapshot_clean_$(YYYYMM) a
inner join #hh_miss1 b on a.household_id = b.household_id
where a.snapshot_date <= '2010-10-31' and (b.fill_from = '1900-01-01' or a.snapshot_date > b.fill_from); 

/* Expand to include all possible household members */
drop table if exists #hh_miss3;
select 
b.snz_uid,
b.snz_msd_uid,
a.household_id,
a.snapshot_date,
b.app_relship,
b.signatory_flag_ind,
b.sex,
b.dob
into #hh_miss3
from #hh_miss2 a
inner join #hnz_1 b on a.household_id = b.household_id and a.imp_date = b.snapshot_date
where a.imp_date is not null;

/* Keep records found in adhoc data */
drop table if exists #hh_miss4;
select 
a.* 
into #hh_miss4
from #hh_miss3 a
inner join (select distinct 
            snz_uid
			,convert(date, hnz_ths_snapshot_date) as snapshot_date 
	        from #tenancy_household_old_rematch
	        where hnz_ths_snapshot_date <= '2010-10-31'
            ) b on a.snz_uid = b.snz_uid and a.snapshot_date = b.snapshot_date;

/* Add records into main table */
drop table if exists #hnz_2;
select * into #hnz_2 from #hnz_1
union all
select * from #hh_miss4;

/* Add in all extra records from adhoc dataset */
drop table if exists #unmatched_sand;
select 
a.snz_uid,
snz_msd_uid,
null as household_id,
convert(date, a.hnz_ths_snapshot_date) as snapshot_date,
hnz_ths_app_relship_text as app_relship,
hnz_ths_signatory_flg_ind as signatory_flag_ind,
case hnz_ths_sex_snz_code when 1 then 'M' when 2 then 'F' else 'X' end as sex,
datefromparts(hnz_ths_birth_year_nbr, hnz_ths_birth_month_nbr, 15) as dob
into #unmatched_sand
from #tenancy_household_old_rematch a
left join (select distinct snz_uid, snapshot_date from #hnz_2) b on a.snz_uid = b.snz_uid and a.hnz_ths_snapshot_date = b.snapshot_date
where b.snz_uid is null;

drop table if exists #hh_hnz;
select * into #hh_hnz from #hnz_2
union all
select * from #unmatched_sand;

/* MSD data - Match MSD household id */
drop table if exists #mapping_not_distinct;
select 
snz_msd_application_uid
into #mapping_not_distinct
from #tenancy_household_id_$(YYYYMM)
where snz_msd_application_uid is not null
group by snz_msd_application_uid
having count(*) > 1;

/* First extract households where snz_msd_application_uid has 1 - 1 mapping with HNZ house */
drop table if exists #hh_mapping_clean;
select a.*
into #hh_mapping_clean
from #tenancy_household_id_$(YYYYMM) a
left join #mapping_not_distinct b on a.snz_msd_application_uid = b.snz_msd_application_uid and a.snz_msd_application_uid is not null
where b.snz_msd_application_uid is null;

drop table if exists #hh_msd_clean;
select 
a.snz_uid,
a.snz_msd_uid,
b.household_id,
convert(date, a.hnz_ths_snapshot_date) as snapshot_date,
a.hnz_ths_app_relship_text as app_relship,	
a.hnz_ths_signatory_flg_ind as signatory_flag_ind,
case a.hnz_ths_gender_snz_code when 1 then 'M' when 2 then 'F' else 'X' end as sex,
datefromparts(a.hnz_ths_birth_year_nbr, a.hnz_ths_birth_month_nbr, 15) as dob
into #hh_msd_clean
from [$(idicleanversion)].[hnz_clean].[tenancy_household_snapshot] a
inner join #hh_mapping_clean b on a.snz_msd_application_uid = b.snz_msd_application_uid
where a.hnz_ths_snapshot_date > '2015-08-31';

/* Combined MSD - HNZ data */
drop table if exists #hh1;
select * into #hh1 from #hh_hnz
union all
select * from #hh_msd_clean;

/* Fill in missing households, these are people who for what ever reason, are not in the current refresh. Go back to last time and get them. */
drop table if exists #last_rfrsh_mssng;
select 
snz_msd_uid,
household_id,
convert(date, hnz_ths_snapshot_date) as snapshot_date,
hnz_ths_app_relship_text as app_relship,
hnz_ths_signatory_flg_ind as signatory_flag_ind,
case hnz_ths_gender_snz_code when 1 then 'M' when 2 then 'F' else 'X' end as sex,
datefromparts(hnz_ths_birth_year_nbr, hnz_ths_birth_month_nbr, 15) as dob
into #last_rfrsh_mssng
from [IDI_Clean_202310].[hnz_clean].[tenancy_household_snapshot] c
left join #tenancy_household_id_$(YYYYMM) d on c.snz_msd_application_uid = d.snz_msd_application_uid 
where c.snz_msd_application_uid is not null 
and hnz_ths_birth_year_nbr =1999;
 
/* Link these back to current refresh, take out duped snapshot dates */
drop table if exists #last_rfrsh_mssng2;
select 
b.snz_uid,
a.snz_msd_uid,
a.household_id,
a.snapshot_date,
a.app_relship,
a.signatory_flag_ind,
a.sex,
a.dob
into #last_rfrsh_mssng2
from #last_rfrsh_mssng a 
left join [$(idicleanversion)].[security].[concordance] b on a.snz_msd_uid=b.snz_msd_uid
where b.snz_uid is not null and snapshot_date >= '2015-09-30' ;

drop table if exists #last_rfrsh_mssng3;
select 
a.*
into #last_rfrsh_mssng3
from #last_rfrsh_mssng2 a
inner join (select distinct 
            snz_uid
	        ,snapshot_date 
	        from #last_rfrsh_mssng2
	        ) b on a.snz_uid = b.snz_uid and a.snapshot_date = b.snapshot_date;

/* Add in children dropped from the latest refresh, records where there is a 1 - 1 mapping between msd and hnz household identifiers */
drop table if exists #missing_children1;
select 
-1 as snz_uid,
-1 as snz_msd_uid,
b.household_id,
convert(date, a.hnz_ths_snapshot_date) as snapshot_date,
a.hnz_ths_app_relship_text as app_relship,
a.hnz_ths_signatory_flg_ind as signatory_flag_ind,
case a.hnz_ths_gender_snz_code when 1 then 'M' when 2 then 'F' else 'X' end as sex,
datefromparts(a.hnz_ths_birth_year_nbr, a.hnz_ths_birth_month_nbr, 15) as dob
into #missing_children1
from [$(idicleanversion)].[hnz_clean].[tenancy_household_snapshot] a
inner join #hh_mapping_clean b on a.snz_msd_application_uid = b.snz_msd_application_uid
where a.hnz_ths_snapshot_date > '2015-08-31';

/* Delete any records where we can find person with same household, dob and gender in current data */
drop table if exists #missing_children2;
select 
a.* 
into #missing_children2
from #missing_children1 a
left join #hh1 b on a.household_id = b.household_id and a.snapshot_date = b.snapshot_date and a.sex = b.sex and a.dob = b.dob 
where b.snz_uid is null;

/* Add to household data */
drop table if exists #hh2;
select * into #hh2 from #hh1
union all
select * from #missing_children2;

/* Remove missing people who are in main dataset */
drop table if exists #last_rfrsh_mssng4;
select 
a.*
into #last_rfrsh_mssng4
from #last_rfrsh_mssng3 a
left join #hh2 b on a.snapshot_date = b.snapshot_date and a.household_id = b.household_id and a.dob = b.dob and a.sex = b.sex
where b.snz_uid is null;

drop table if exists #hh2_add_msg;
select * into #hh2_add_msg from #hh2
union all
select * from  #last_rfrsh_mssng4;

/* Match household members across data migration - Merge on spine indicator and primary tenant */
drop table if exists #hh3;
select 
a.*, 
b.snz_spine_ind, 
c.snz_uid as primary_uid,
/* dob ID allows us to distinguish between members of household with same sex + dob eg twins */
row_number() over (partition by a.snapshot_date, a.household_id, a.sex, a.dob order by a.snz_uid) as dob_id
into #hh3
from #hh2_add_msg a
left join [$(idicleanversion)].[data].[personal_detail] b on a.snz_uid = b.snz_uid
left join #tenancy_snapshot_clean_$(YYYYMM) c on a.household_id = c.household_id and a.snapshot_date = c.snapshot_date;

/* Extract all records at end of HNZ data */
drop table if exists #hnz_end;
select 
* 
into #hnz_end 
from #hh3 
where snapshot_date = '2015-08-31';

/* Extract all records at start of MSD data */
drop table if exists #msd_start;
select 
* 
into #msd_start 
from #hh3 
where snapshot_date = '2015-09-30';

/* Match records across data migration using household ID + sex + dob */
drop table if exists #prioritised_source;
select 
h.snz_uid as hnz_snz_uid,
m.snz_uid as msd_snz_uid,
h.household_id,
h.sex,
h.dob,
h.dob_id,
m.primary_uid as msd_primary_uid,
case 
	/* Prioritise records on the spine over records not on the spine */
	when h.snz_spine_ind = 1 and m.snz_spine_ind = 0 then 'HNZ'
	when h.snz_spine_ind = 0 and m.snz_spine_ind = 1 then 'MSD'
	/* Prioritise primary records over records that are not primary */
	when h.primary_uid = h.snz_uid and m.primary_uid != m.snz_uid then 'HNZ'
	when h.primary_uid != h.snz_uid and m.primary_uid = m.snz_uid then 'MSD'
	/* Otherwise default to HNZ records as linking is generally better here */
	else 'HNZ'
end as priority_source
into #prioritised_source
from #hnz_end h
inner join #msd_start m on h.household_id = m.household_id and h.sex = m.sex and h.dob = m.dob and h.dob_id = m.dob_id
where h.snz_uid != m.snz_uid;

drop table if exists #id_mapping;
select 
iif(priority_source = 'HNZ', msd_snz_uid, hnz_snz_uid) as from_snz_uid,
iif(priority_source = 'HNZ', hnz_snz_uid, msd_snz_uid) as to_snz_uid,
household_id,
sex,
dob,
dob_id,
msd_primary_uid
into #id_mapping
from #prioritised_source;

/* On first linking pass match on household */
drop table if exists #hh4;
select 
coalesce(b.to_snz_uid, a.snz_uid) as snz_uid,
a.snz_uid as raw_snz_uid,
a.household_id,
a.snapshot_date,
a.app_relship,
a.signatory_flag_ind,
a.sex,
a.dob,
a.primary_uid
into #hh4
from #hh3 a
left join #id_mapping b on a.household_id = b.household_id and a.sex = b.sex and a.dob = b.dob and a.dob_id = b.dob_id;

/* On second linking pass match through primary tenant */
drop table if exists #id_mapping_pt;
select distinct 
snz_uid
,raw_snz_uid
,primary_uid
,sex
,dob
into #id_mapping_pt
from #hh4
where snapshot_date > '2015-08-31' and snz_uid != raw_snz_uid;

/* Check for records where mapping is not 1-1 */
drop table if exists #id_mapping_pt_dup;
select 
raw_snz_uid
,primary_uid
,sex
,dob
into #id_mapping_pt_dup
from #id_mapping_pt
group by raw_snz_uid,primary_uid,sex,dob
having count(*) > 1;

/* Identify records on spine and last snapshot date */
drop table if exists #id_mapping_pt_dup2;
select
b.snz_uid, 
b.raw_snz_uid, 
b.primary_uid, 
b.sex, 
b.dob, 
c.snz_spine_ind,
max(b.snapshot_date) as max_snapshot_date
into #id_mapping_pt_dup2
from #id_mapping_pt_dup a
inner join #hh4 b on a.raw_snz_uid = b.raw_snz_uid and a.primary_uid = b.primary_uid and a.sex = b.sex and a.dob = b.dob
inner join [$(idicleanversion)].[data].[personal_detail] c on b.snz_uid = c.snz_uid 
group by b.snz_uid,b.raw_snz_uid,b.primary_uid,b.sex,b.dob,c.snz_spine_ind;

/* Priotise records on the spine and then latest observation */
drop table if exists #id_mapping_pt_dup3;
select 
*
,row_number() over (partition by raw_snz_uid, primary_uid, sex, dob order by snz_spine_ind desc, max_snapshot_date desc) as rnk
into #id_mapping_pt_dup3
from #id_mapping_pt_dup2 
order by raw_snz_uid, snz_spine_ind desc;

/* De-duplicate */
drop table if exists #id_mapping_pt_dedup;
select 
a.*
into #id_mapping_pt_dedup
from #id_mapping_pt a
left join #id_mapping_pt_dup3 b on a.raw_snz_uid = b.raw_snz_uid and a.snz_uid = b.snz_uid and a.primary_uid = b.primary_uid and a.sex = b.sex and a.dob = b.dob
where b.snz_uid is null or b.rnk = 1;

/* Update snz_uid based on primary household member */
drop table if exists #hh5;
select 
case 
	when a.snapshot_date > '2015-08-31' and b.snz_uid is not null then b.snz_uid
	else a.snz_uid
end as snz_uid,
a.snz_uid as raw_snz_uid,
a.household_id,
a.snapshot_date,
a.app_relship,
a.signatory_flag_ind,
a.sex,
a.dob,
a.primary_uid
into #hh5
from #hh4 a
left join #id_mapping_pt_dedup b on a.primary_uid = b.primary_uid and a.raw_snz_uid = b.raw_snz_uid and a.sex = b.sex and a.dob = b.dob; 
		
/* Identify duplicates */
/* First duplicates in same household */
drop table if exists #hh_dups;
select 
a.*
,count_hh_date 
into #hh_dups
from #hh5 a
inner join (select 
            snz_uid
			,household_id
			,snapshot_date
			,count(*) as count_hh_date 
			from #hh5 
			group by snz_uid, household_id, snapshot_date 
			having count(*) > 1
            )  b
on a.snz_uid = b.snz_uid 
and a.household_id = b.household_id 
and a.snapshot_date = b.snapshot_date;

/* Transpose data into one row per person per date */
drop table if exists #hh_dups1prep;
select 
snz_uid
,household_id
,snapshot_date
,case when rownum = 1 then dob else '1900-01-01' end as dob_1
,case when rownum = 1 and sex='M' then 1 when rownum=1 and sex='F' then 2 else 0 end as sex_1
,case when rownum = 2 then dob else '1900-01-01' end as dob_2
,case when rownum = 2 and sex='M' then 1 when rownum=2 and sex='F' then 2 else 0 end as sex_2
,case when rownum = 3 then dob else '1900-01-01' end as dob_3
,case when rownum = 3 and sex='M' then 1 when rownum=3 and sex='F' then 2 else 0 end as sex_3
,case when rownum = 4 then dob else '1900-01-01' end as dob_4
,case when rownum = 4 and sex='M' then 1 when rownum=4 and sex='F' then 2 else 0 end as sex_4
into #hh_dups1prep
from (select 
      *
	  ,row_number() over (partition by snz_uid, household_id, snapshot_date order by snz_uid, household_id, snapshot_date,dob,sex) as rownum 
	  from #hh_dups 
	  where snz_uid <> -1
     ) a
;

drop table if exists #hh_dups1;
select 
snz_uid
,household_id
,snapshot_date
,count(*) as [count]
,max(isnull(dob_1,'1900-01-01')) as dob_1
,sum(sex_1) as sex_1
,max(isnull(dob_2,'1900-01-01')) as dob_2
,sum(sex_2) as sex_2
,max(isnull(dob_3,'1900-01-01')) as dob_3
,sum(sex_3) as sex_3
,max(isnull(dob_4,'1900-01-01')) as dob_4
,sum(sex_4) as sex_4
into #hh_dups1
from #hh_dups1prep
group by snz_uid,household_id,snapshot_date;

/* Summarise data into one row per person */
drop table if exists #hh_dups2;
select 
snz_uid
,[count]
,dob_1
,sex_1
,dob_2
,sex_2
,dob_3
,sex_3
,dob_4
,sex_4
,count(*) as num_dates
into #hh_dups2
from #hh_dups1
group by snz_uid,[count],dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4
order by snz_uid,[count],dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4;

/* Merge on ird numbers from swn table */
drop table if exists #hh_dups3;
select 
a.*
,max(isnull(b.msd_swn_snz_unique_nbr,-1)) as msd_swn_num
,max(case when a.dob_1 = b.dob and a.sex_1 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_1
,max(case when a.dob_2 = b.dob and a.sex_2 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_2
,max(case when a.dob_3 = b.dob and a.sex_3 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_3
,max(case when a.dob_4 = b.dob and a.sex_4 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_4
into #hh_dups3
from #hh_dups2 a
left join (select 
           snz_uid
		   ,msd_swn_snz_unique_nbr
		   ,datefromparts(msd_swn_birth_year_nbr, msd_swn_birth_month_nbr, 15) as dob
		   ,msd_swn_gender_snz_code as sex
		   ,snz_ird_uid
			from [$(idicleanversion)].[msd_clean].[msd_swn] 
			) b on a.snz_uid = b.snz_uid
group by a.snz_uid,[count],dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4,num_dates;

/* Calculate unique IRD numbers and unique number of birthdays per person and look at any partner birthdate */
drop table if exists #hh_dups4;
select 
*
,case when snz_ird_uid_1 <> -1 and snz_ird_uid_1 > 0 
	then 1 else 0 end as ird_uni_1
,case when snz_ird_uid_2 <> -1  and snz_ird_uid_2 > 0 and 
	(snz_ird_uid_2 <> snz_ird_uid_1 or snz_ird_uid_1 = -1) 
	then 1 else 0  end as ird_uni_2
,case when snz_ird_uid_3 <> -1  and snz_ird_uid_3 > 0 and 
	((snz_ird_uid_3 <> snz_ird_uid_1 or snz_ird_uid_1 = -1) and (snz_ird_uid_3 <> snz_ird_uid_2 or snz_ird_uid_2 = -1)) 
	then 1 else 0  end as ird_uni_3
,case when snz_ird_uid_4 <> -1  and snz_ird_uid_4 > 0 and 
	((snz_ird_uid_4 <> snz_ird_uid_1 or snz_ird_uid_1 = -1) and (snz_ird_uid_4 <> snz_ird_uid_2 or snz_ird_uid_2 = -1) and (snz_ird_uid_4 <> snz_ird_uid_3 or snz_ird_uid_3 = -1)) 
	then 1 else 0  end as ird_uni_4	
,case when dob_1 is not null and dob_1 <> '1900-01-01'  then 1 else 0 end as unique_bday1
,case when dob_2 is not null and dob_2 <> '1900-01-01' and 
	(dob_2 <> dob_1 or sex_2 <> sex_1) then 1 else 0  end as unique_bday2
,case when dob_3 is not null and dob_3 <> '1900-01-01'  and 
	((dob_3 <> dob_1 or sex_3 <> sex_1) and (dob_3 <> dob_2 or sex_3 <> sex_2)) then 1 else 0  end as unique_bday3
,case when dob_4 is not null and dob_4 <> '1900-01-01'  and 
	((dob_4 <> dob_1 or sex_4 <> sex_1) and (dob_4 <> dob_2 or sex_4 <> sex_2) and (dob_4 <> dob_3 or sex_4 <> sex_3)) then 1 else 0  end as unique_bday4
into #hh_dups4
from #hh_dups3;

/* Find those with multiple ird number entries */
drop table if exists #hh_dups5;
select 
*
into #hh_dups5
from #hh_dups4
where ird_uni_1+ird_uni_2+ird_uni_3+ird_uni_4 > 1;

/* Convert to long dataset */
drop table if exists #hh_dups6;
select 
snz_uid,dob_1 as dob
,case when sex_1 = 1 then 'M' when sex_1 = 2 then 'F' else 'X' end as sex
,snz_ird_uid_1 as snz_ird_uid
into #hh_dups6
from #hh_dups5
where ird_uni_1 = 1
union
select snz_uid,dob_2 as dob, case when sex_2 = 1 then 'M' when sex_2 = 2 then 'F' else 'X' end as sex,snz_ird_uid_2 as snz_ird_uid
from #hh_dups5
where ird_uni_2 = 1
union
select snz_uid,dob_3 as dob, case when sex_3 = 1 then 'M' when sex_3 = 2 then 'F' else 'X' end as sex,snz_ird_uid_3 as snz_ird_uid
from #hh_dups5
where ird_uni_3 = 1
union
select snz_uid,dob_4 as dob, case when sex_4 = 1 then 'M' when sex_4 = 2 then 'F' else 'X' end as sex,snz_ird_uid_4 as snz_ird_uid
from #hh_dups5
where ird_uni_4 = 1;

/* Add on snz_uid from ird number */
drop table if exists #hh_dups7;
select 
a.*
,b.snz_uid as new_snz_uid
into #hh_dups7
from #hh_dups6 a
left join [$(idicleanversion)].[security].[concordance] b on a.snz_ird_uid = b.snz_ird_uid;

/* Find those with multiple birthdays entries */
drop table if exists #hh_dups8;
select 
*
,unique_bday1+unique_bday2+unique_bday3+unique_bday4 as num_bdays
into #hh_dups8
from #hh_dups4
where unique_bday1+unique_bday2+unique_bday4+unique_bday4 > 1 and ird_uni_1+ird_uni_2+ird_uni_3+ird_uni_4 <= 1;

/* Work out which birthday has the correct identifier and also bring through partner information with a check on dob/sex including fellow birth parents */
drop table if exists #hh_dups9;
select 
a.snz_uid
,dob_1
,sex_1
,dob_2
,sex_2
,dob_3
,sex_3
,dob_4
,sex_4
,max(case when dob_1 = b.dob and sex_1 = b.snz_sex_gender_code and unique_bday1= 1 then 2 when unique_bday1= 1 then 1 else 0 end) as Match_1 
,max(case when dob_2 = b.dob and sex_2 = b.snz_sex_gender_code and unique_bday2= 1 then 2 when unique_bday2= 1 then 1 else 0  end) as Match_2 
,max(case when dob_3 = b.dob and sex_3 = b.snz_sex_gender_code and unique_bday3= 1 then 2 when unique_bday3= 1 then 1 else 0  end) as Match_3 
,max(case when dob_4 = b.dob and sex_4 = b.snz_sex_gender_code and unique_bday4= 1 then 2 when unique_bday4= 1 then 1 else 0  end) as Match_4 
,max(case when dob_1 = d.dob and sex_1 = d.snz_sex_gender_code and c.snz_associated_uid <> a.snz_uid then c.snz_associated_uid 
	 when dob_1 = f.dob and sex_1 = f.snz_sex_gender_code and f.snz_uid <> a.snz_uid then f.snz_uid 
	 when dob_1 = h.dob and sex_1 = h.snz_sex_gender_code and h.snz_uid <> a.snz_uid then h.snz_uid  else -1 end) as assoc_uid_1
,max(case when dob_2 = d.dob and sex_2 = d.snz_sex_gender_code and c.snz_associated_uid <> a.snz_uid then c.snz_associated_uid 
	 when dob_2 = f.dob and sex_2 = f.snz_sex_gender_code and f.snz_uid <> a.snz_uid then f.snz_uid 
	 when dob_2 = h.dob and sex_2 = h.snz_sex_gender_code and h.snz_uid <> a.snz_uid then h.snz_uid  else -1 end) as assoc_uid_2
,max(case when dob_3 = d.dob and sex_3 = d.snz_sex_gender_code and c.snz_associated_uid <> a.snz_uid then c.snz_associated_uid 
	 when dob_3 = f.dob and sex_3 = f.snz_sex_gender_code and f.snz_uid <> a.snz_uid then f.snz_uid 
	 when dob_3 = h.dob and sex_3 = h.snz_sex_gender_code and h.snz_uid <> a.snz_uid then h.snz_uid  else -1 end) as assoc_uid_3
,max(case when dob_4 = d.dob and sex_4 = d.snz_sex_gender_code and c.snz_associated_uid <> a.snz_uid then c.snz_associated_uid 
	 when dob_4 = f.dob and sex_4 = f.snz_sex_gender_code and f.snz_uid <> a.snz_uid then f.snz_uid 
	 when dob_4 = h.dob and sex_4 = h.snz_sex_gender_code and h.snz_uid <> a.snz_uid then h.snz_uid  else -1 end) as assoc_uid_4
into #hh_dups9
from #hh_dups8 a
left join (select snz_uid,datefromparts(snz_birth_year_nbr, snz_birth_month_nbr, 15) as dob,snz_sex_gender_code from [$(idicleanversion)].[data].[personal_detail]) b on a.snz_uid = b.snz_uid 
left join (select distinct snz_uid,snz_associated_uid  from [$(idicleanversion)].[data].[person_relationship]) c on a.snz_uid = c.snz_uid 
left join (select snz_uid, datefromparts(snz_birth_year_nbr, snz_birth_month_nbr, 15) as dob,snz_sex_gender_code,snz_spine_ind from [$(idicleanversion)].[data].[personal_detail]) d on c.snz_associated_uid = d.snz_uid and d.snz_spine_ind = 1
left join [$(idicleanversion)].[data].[personal_detail] e on a.snz_uid = e.snz_parent1_uid
left join (select snz_uid, datefromparts(snz_birth_year_nbr, snz_birth_month_nbr, 15) as dob,snz_sex_gender_code,snz_spine_ind from [$(idicleanversion)].[data].[personal_detail]) f on e.snz_parent2_uid = f.snz_uid and f.snz_spine_ind = 1
left join [$(idicleanversion)].[data].[personal_detail] g on a.snz_uid = g.snz_parent2_uid
left join (select snz_uid, datefromparts(snz_birth_year_nbr, snz_birth_month_nbr, 15) as dob,snz_sex_gender_code,snz_spine_ind from [$(idicleanversion)].[data].[personal_detail]) h on g.snz_parent1_uid = h.snz_uid and h.snz_spine_ind = 1
group by a.snz_uid,dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4;

/* Now put in one big long table, first creating logic with what to use as the ID */
drop table if exists #hh_dups9a;
select 
*
,case when match_1 = 2  then 1 
	  when assoc_uid_1 <> -1 then 2
	  when (match_2 <= 1 and match_3 <= 1 and match_4 <= 1) then 1 else 3 end as criteria_1
,case when match_2 = 2 then 1 
      when assoc_uid_2 <> -1 then 2
	  when (match_1 <= 1 and match_3 <= 1 and match_4 <= 1 and assoc_uid_1 <> -1 ) then 1
	  when Match_1 = 2 or (match_3 <= 1 and match_4 <= 1) then 3 else 4 end as criteria_2
,case when match_3 = 2 then 1 
	  when assoc_uid_3 <> -1  then 2
	  when Match_1 = 0 or Match_2 = 0 or assoc_uid_1 <> -1  or assoc_uid_2 <> -1  then 3 else 4 end as criteria_3
,case when match_4 = 2 then 1 
	  when assoc_uid_4 <> -1  then 2 else 4 end as criteria_4
into #hh_dups9a
from #hh_dups9;

drop table if exists #hh_dups10;
select 
snz_uid
,dob
,sex
,new_snz_uid
into #hh_dups10
from #hh_dups7
union
select 
snz_uid,dob_1 as dob
,case when sex_1 = 1 then 'M' when sex_1 = 2 then 'F' else 'X' end as sex
,case when criteria_1 = 1 then snz_uid when criteria_1 = 2 then assoc_uid_1 when criteria_1 = 3 then -snz_uid else -snz_uid - 10000000 end as new_snz_uid
from #hh_dups9a
where Match_1 > 0
union
select 
snz_uid,dob_2 as dob
,case when sex_2 = 1 then 'M' when sex_2 = 2 then 'F' else 'X' end as sex
,case when criteria_2 = 1 then snz_uid when criteria_2 = 2 then assoc_uid_2 when criteria_2 = 3 then -snz_uid else -snz_uid - 10000000 end as new_snz_uid
from #hh_dups9a
where Match_2 > 0
union
select 
snz_uid,dob_3 as dob
,case when sex_3 = 1 then 'M' when sex_3 = 2 then 'F' else 'X' end as sex
,case when criteria_3 = 1 then snz_uid when criteria_3 = 2 then assoc_uid_1 when criteria_3 = 3 then -snz_uid else -snz_uid - 10000000 end as new_snz_uid
from #hh_dups9a
where Match_3 > 0
union
select 
snz_uid,dob_4 as dob
,case when sex_4 = 1 then 'M' when sex_4 = 2 then 'F' else 'X' end as sex
,case when criteria_4 = 1 then snz_uid when criteria_4 = 2 then assoc_uid_4 when criteria_4 = 3 then -snz_uid else -snz_uid - 10000000 end as new_snz_uid
from #hh_dups9a
where Match_4 > 0;

drop table if exists #hh_dups10a;
select 
snz_uid
,dob
,sex
,iif(sex = 'X' and new_snz_uid = -snz_uid, snz_uid, new_snz_uid) as new_snz_uid
into #hh_dups10a
from #hh_dups10;

drop table if exists #dups;
select 
a.*
,count_date 
into #dups
from #hh5 a
inner join (select 
            snz_uid
			,snapshot_date
			,count(*) as count_date 
            from (select 
				  snz_uid
				  ,household_id
				  ,snapshot_date 
				  from #hh5 
				  group by snz_uid, household_id, snapshot_date
				  ) c 
            group by snz_uid, snapshot_date having count(*) > 1
			) b on a.snz_uid = b.snz_uid and a.snapshot_date = b.snapshot_date;

drop table if exists #dups1prep;
select 
snz_uid
,snapshot_date
,case when rownum = 1 then dob else '1900-01-01' end as dob_1
,case when rownum = 1 and sex='M' then 1 when rownum=1 and sex='F' then 2 else 0 end as sex_1
,case when rownum = 2 then dob else '1900-01-01' end as dob_2
,case when rownum = 2 and sex='M' then 1 when rownum=2 and sex='F' then 2 else 0 end as sex_2
,case when rownum = 3 then dob else '1900-01-01' end as dob_3
,case when rownum = 3 and sex='M' then 1 when rownum=3 and sex='F' then 2 else 0 end as sex_3
,case when rownum = 4 then dob else '1900-01-01' end as dob_4
,case when rownum = 4 and sex='M' then 1 when rownum=4 and sex='F' then 2 else 0 end as sex_4
into #dups1prep
from (select 
      *
	  ,row_number() over (partition by snz_uid, snapshot_date order by snz_uid, snapshot_date,dob,sex) as rownum 
	  from #dups 
	  where snz_uid <> -1
	 ) a
;

drop table if exists #dups1;
select 
snz_uid
,snapshot_date
,count(*) as [count]
,max(isnull(dob_1,'1900-01-01')) as dob_1
,sum(sex_1) as sex_1
,max(isnull(dob_2,'1900-01-01')) as dob_2
,sum(sex_2) as sex_2
,max(isnull(dob_3,'1900-01-01')) as dob_3
,sum(sex_3) as sex_3
,max(isnull(dob_4,'1900-01-01')) as dob_4
,sum(sex_4) as sex_4
into #dups1
from #dups1prep
group by snz_uid,snapshot_date;

drop table if exists #dups2;
select 
snz_uid
,[count]
,dob_1
,sex_1
,dob_2
,sex_2
,dob_3
,sex_3
,dob_4
,sex_4
,count(*) as num_dates
into #dups2
from #dups1
group by snz_uid,[count],dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4
order by snz_uid,[count],dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4;

/* Merge on ird numbers from swn table */
drop table if exists #dups3;
select 
a.*
,max(isnull(b.msd_swn_snz_unique_nbr,-1)) as msd_swn_num
,max(case when a.dob_1 = b.dob and a.sex_1 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_1
,max(case when a.dob_2 = b.dob and a.sex_2 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_2
,max(case when a.dob_3 = b.dob and a.sex_3 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_3
,max(case when a.dob_4 = b.dob and a.sex_4 = b.sex then snz_ird_uid else -1 end) as snz_ird_uid_4
into #dups3
from #dups2 a
left join (select 
           snz_uid
		   ,msd_swn_snz_unique_nbr
		   ,datefromparts(msd_swn_birth_year_nbr, msd_swn_birth_month_nbr, 15) as dob
		   ,msd_swn_gender_snz_code as sex
		   ,snz_ird_uid
			from [$(idicleanversion)].[msd_clean].[msd_swn] 
			) b on a.snz_uid = b.snz_uid
group by a.snz_uid,[count],dob_1,sex_1,dob_2,sex_2,dob_3,sex_3,dob_4,sex_4,num_dates;

/* Calculate unique IRD numbers and unique number of birthdays per person */
drop table if exists #dups4;
select 
*
,case when snz_ird_uid_1 <> -1 and snz_ird_uid_1 > 0 then 1 else 0 end as ird_uni_1
,case when snz_ird_uid_2 <> -1 and snz_ird_uid_2 > 0 and (snz_ird_uid_2 <> snz_ird_uid_1 or snz_ird_uid_1 = -1) then 1 else 0 end as ird_uni_2
,case when snz_ird_uid_3 <> -1 and snz_ird_uid_3 > 0 and ((snz_ird_uid_3 <> snz_ird_uid_1 or snz_ird_uid_1 = -1) and (snz_ird_uid_3 <> snz_ird_uid_2 or snz_ird_uid_2 = -1)) then 1 else 0 end as ird_uni_3
,case when snz_ird_uid_4 <> -1 and snz_ird_uid_4 > 0 and ((snz_ird_uid_4 <> snz_ird_uid_1 or snz_ird_uid_1 = -1) and (snz_ird_uid_4 <> snz_ird_uid_2 or snz_ird_uid_2 = -1) and (snz_ird_uid_4 <> snz_ird_uid_3 or snz_ird_uid_3 = -1)) then 1 else 0 end as ird_uni_4	
,case when dob_1 is not null and dob_1 <> '1900-01-01'  then 1 else 0 end as unique_bday1
,case when dob_2 is not null and dob_2 <> '1900-01-01' and (dob_2 <> dob_1 or sex_2 <> sex_1) then 1 else 0 end as unique_bday2
,case when dob_3 is not null and dob_3 <> '1900-01-01' and ((dob_3 <> dob_1 or sex_3 <> sex_1) and (dob_3 <> dob_2 or sex_3 <> sex_2)) then 1 else 0 end as unique_bday3
,case when dob_4 is not null and dob_4 <> '1900-01-01' and ((dob_4 <> dob_1 or sex_4 <> sex_1) and (dob_4 <> dob_2 or sex_4 <> sex_2) and (dob_4 <> dob_3 or sex_4 <> sex_3)) then 1 else 0 end as unique_bday4
into #dups4
from #dups3;

/* Find those with multiple ird number entries */
drop table if exists #dups5;
select *
into #dups5
from #dups4
where ird_uni_1+ird_uni_2+ird_uni_3+ird_uni_4 > 1;

/* Convert to long dataset */
drop table if exists #dups6;
select snz_uid,dob_1 as dob, case when sex_1 = 1 then 'M' when sex_1 = 2 then 'F' else 'X' end as sex,snz_ird_uid_1 as snz_ird_uid
into #dups6
from #dups5
where ird_uni_1 = 1
union
select snz_uid,dob_2 as dob, case when sex_2 = 1 then 'M' when sex_2 = 2 then 'F' else 'X' end as sex,snz_ird_uid_2 as snz_ird_uid
from #dups5
where ird_uni_2 = 1
union
select snz_uid,dob_3 as dob, case when sex_3 = 1 then 'M' when sex_3 = 2 then 'F' else 'X' end as sex,snz_ird_uid_3 as snz_ird_uid
from #dups5
where ird_uni_3 = 1
union
select snz_uid,dob_4 as dob, case when sex_4 = 1 then 'M' when sex_4 = 2 then 'F' else 'X' end as sex,snz_ird_uid_4 as snz_ird_uid
from #dups5
where ird_uni_4 = 1;

/* Add on snz_uid from ird number */
drop table if exists #dups7;
select 
a.*
,b.snz_uid as new_snz_uid
into #dups7
from #dups6 a
left join [$(idicleanversion)].[security].[concordance] b on a.snz_ird_uid = b.snz_ird_uid;

/* Combine with other table */
drop table if exists #duplicate_snz_ids;
select 
snz_uid
,dob
,sex
,new_snz_uid
into #duplicate_snz_ids
from #hh_dups10a
union
select snz_uid,dob,sex,new_snz_uid
from #dups7;

drop table if exists #duplicate_snz_ids1;
select 
a.snz_uid
,max(iif(row_num = 1,dob,'1900-01-01')) as dob_1
,max(iif(row_num = 2,dob,'1900-01-01')) as dob_2
,max(iif(row_num = 3,dob,'1900-01-01')) as dob_3
,max(iif(row_num = 4,dob,'1900-01-01')) as dob_4
,max(iif(row_num = 5,dob,'1900-01-01')) as dob_5
,max(iif(row_num = 6,dob,'1900-01-01')) as dob_6
,max(iif(row_num = 1,sex,'')) as sex_1
,max(iif(row_num = 2,sex,'')) as sex_2
,max(iif(row_num = 3,sex,'')) as sex_3
,max(iif(row_num = 4,sex,'')) as sex_4
,max(iif(row_num = 5,sex,'')) as sex_5
,max(iif(row_num = 6,sex,'')) as sex_6
,max(iif(row_num = 1,new_snz_uid,-1)) as new_id_1
,max(iif(row_num = 2,new_snz_uid,-1)) as new_id_2
,max(iif(row_num = 3,new_snz_uid,-1)) as new_id_3
,max(iif(row_num = 4,new_snz_uid,-1)) as new_id_4
,max(iif(row_num = 5,new_snz_uid,-1)) as new_id_5
,max(iif(row_num = 6,new_snz_uid,-1)) as new_id_6
into #duplicate_snz_ids1
from (select distinct 
      snz_uid 
	  from #duplicate_snz_ids
	  ) a
left join (select 
           *
		   ,row_number() over (partition by snz_uid order by snz_uid,new_snz_uid desc,dob,sex) as row_num 
		   from #duplicate_snz_ids
		  ) b on a.snz_uid = b.snz_uid
group by a.snz_uid;

drop table if exists #duplicate_snz_ids1a;
select 
snz_uid
,case when dob_1 = '1900-01-01' then null else dob_1 end as dob_1
,case when dob_2 = '1900-01-01' then null else dob_2 end as dob_2
,case when dob_3 = '1900-01-01' then null else dob_3 end as dob_3
,case when dob_4 = '1900-01-01' then null else dob_4 end as dob_4
,case when dob_5 = '1900-01-01' then null else dob_5 end as dob_5
,case when dob_6 = '1900-01-01' then null else dob_6 end as dob_6
,case when sex_1 = '' then null else sex_1 end as sex_1
,case when sex_2 = '' then null else sex_2 end as sex_2
,case when sex_3 = '' then null else sex_3 end as sex_3
,case when sex_4 = '' then null else sex_4 end as sex_4
,case when sex_5 = '' then null else sex_5 end as sex_5
,case when sex_6 = '' then null else sex_6 end as sex_6
,case when new_id_1 = -1 then null else new_id_1 end as new_id_1
,case when new_id_2 = -1 then null else new_id_2 end as new_id_2
,case when new_id_3 = -1 then null else new_id_3 end as new_id_3
,case when new_id_4 = -1 then null else new_id_4 end as new_id_4
,case when new_id_5 = -1 then null else new_id_5 end as new_id_5
,case when new_id_6 = -1 then null else new_id_6 end as new_id_6
into #duplicate_snz_ids1a
from #duplicate_snz_ids1;

/* Apply some adjustments based on:
1) tidy up where the columns have same dob and same sex (due to 2 different approaches added together)
2) if dob the same and gender different and no evidence of another id then stick with original ■■■■) if gender the same and dob out by 1 month or 1 year then stick with original id */
drop table if exists #duplicate_snz_ids2;
select 
*
,iif(dob_1 = dob_2 and sex_1 = sex_2,1,0) as match_1_2
,iif(dob_1 = dob_3 and sex_1 = sex_3,1,0) as match_1_3
,iif(dob_1 = dob_4 and sex_1 = sex_4,1,0) as match_1_4
,iif(dob_1 = dob_5 and sex_1 = sex_5,1,0) as match_1_5
,iif(dob_1 = dob_6 and sex_1 = sex_6,1,0) as match_1_6
,iif(dob_2 = dob_3 and sex_2 = sex_3,1,0) as match_2_3
,iif(dob_2 = dob_4 and sex_2 = sex_4,1,0) as match_2_4
,iif(dob_2 = dob_5 and sex_2 = sex_5,1,0) as match_2_5
,iif(dob_2 = dob_6 and sex_2 = sex_6,1,0) as match_2_6
,iif(dob_3 = dob_4 and sex_3 = sex_4,1,0) as match_3_4
,iif(dob_3 = dob_5 and sex_3 = sex_5,1,0) as match_3_5
,iif(dob_3 = dob_6 and sex_3 = sex_6,1,0) as match_3_6
,iif(dob_4 = dob_5 and sex_4 = sex_5,1,0) as match_4_5
,iif(dob_4 = dob_6 and sex_4 = sex_6,1,0) as match_4_6
,iif(dob_5 = dob_6 and sex_5 = sex_6,1,0) as match_5_6
into #duplicate_snz_ids2
from #duplicate_snz_ids1a;

drop table if exists #duplicate_snz_ids22;
select 
snz_uid
,dob_1
,dob_2
,sex_1
,sex_2
,new_id_1
,case 
	when dob_1 = dob_2 and new_id_1 = -new_id_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(MONTH,1,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(MONTH,-1,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(YEAR,1,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(YEAR,-1,dob_1) = dob_2 then new_id_1
	when new_id_1 != -new_id_2 then new_id_2
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(MONTH,2,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(MONTH,-2,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(YEAR,2,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(YEAR,-2,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(YEAR,10,dob_1) = dob_2 then new_id_1
	when sex_1 = sex_2 and new_id_1 = -new_id_2 and dateadd(YEAR,-10,dob_1) = dob_2 then new_id_1
    else -new_id_1 end as new_id_2
into #duplicate_snz_ids22
from #duplicate_snz_ids2
where dob_3 is null;

drop table if exists #duplicate_snz_ids23a;
select 
snz_uid
,dob_1
,dob_2
,dob_3
,sex_1
,sex_2
,sex_3
,new_id_1
,case when dob_1 = dob_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(MONTH,1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(MONTH,-1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(YEAR,1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(YEAR,-1,dob_1) = dob_2 then new_id_1
		when new_id_2 > 0 and new_id_1 != new_id_2 then new_id_2
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(MONTH,2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(MONTH,-2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(YEAR,2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(YEAR,-2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(YEAR,10,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) and dateadd(YEAR,-10,dob_1) = dob_2 then new_id_1
		else -new_id_1 end as new_id_2
,case when dob_1 = dob_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(MONTH,1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(MONTH,-1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(YEAR,1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(YEAR,-1,dob_1) = dob_3 then new_id_1
		when new_id_3 > 0 and new_id_1 != new_id_3 then new_id_3
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(MONTH,2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(MONTH,-2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(YEAR,2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(YEAR,-2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(YEAR,10,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) and dateadd(YEAR,-10,dob_1) = dob_3 then new_id_1
		end as new_id_3
into #duplicate_snz_ids23a
from #duplicate_snz_ids2 a
where dob_4 is null and dob_3 is not null;

drop table if exists #duplicate_snz_ids23;
select 
snz_uid
,dob_1
,dob_2
,dob_3
,sex_1
,sex_2
,sex_3
,new_id_1
,case when new_id_2 != -new_id_1 then new_id_2
		when new_id_3 is null then new_id_2
		when dob_2 = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(MONTH,1,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(MONTH,-1,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(YEAR,1,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(YEAR,-1,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(MONTH,2,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(MONTH,-2,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(YEAR,2,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(YEAR,-2,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(YEAR,10,dob_2) = dob_3 then new_id_3
		when sex_2 = sex_3 and dateadd(YEAR,-10,dob_2) = dob_3 then new_id_3
		else new_id_2 end as new_id_2
,case when new_id_3 is not null then new_id_3
		when dob_2 = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,-1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,-1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,-2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,-2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,10,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,-10,dob_2) = dob_3 then new_id_2
		when new_id_2 < 0 then -snz_uid-10000000
		else -snz_uid end as new_id_3
into #duplicate_snz_ids23
from #duplicate_snz_ids23a;

drop table if exists #duplicate_snz_ids41;
select 
snz_uid
,dob_1
,dob_2
,dob_3
,dob_4
,sex_1
,sex_2
,sex_3
,sex_4
,new_id_1
,case when dob_1 = dob_2 and (new_id_1 = new_id_2 or new_id_1 = -new_id_2) then new_id_1
		when new_id_2 > 0 and new_id_1 != new_id_2 then new_id_2
		when sex_1 = sex_2 and dateadd(MONTH,1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(MONTH,-1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(YEAR,1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(YEAR,-1,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(MONTH,2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(MONTH,-2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(YEAR,2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(YEAR,-2,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(YEAR,10,dob_1) = dob_2 then new_id_1
		when sex_1 = sex_2 and dateadd(YEAR,-10,dob_1) = dob_2 then new_id_1
		else -new_id_1 end as new_id_2
,case when dob_1 = dob_3 and (new_id_1 = new_id_3 or new_id_1 = -new_id_3) then new_id_1
		when new_id_3 > 0 and new_id_1 != new_id_3 then new_id_3
		when sex_1 = sex_3 and dateadd(MONTH,1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(MONTH,-1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(YEAR,1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(YEAR,-1,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(MONTH,2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(MONTH,-2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(YEAR,2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(YEAR,-2,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(YEAR,10,dob_1) = dob_3 then new_id_1
		when sex_1 = sex_3 and dateadd(YEAR,-10,dob_1) = dob_3 then new_id_1
		end as new_id_3
,case when dob_1 = dob_4 and (new_id_1 = new_id_4 or new_id_1 = -new_id_4) then new_id_1
		when new_id_4 > 0 and new_id_1 != new_id_4 then new_id_4
		when sex_1 = sex_4 and dateadd(MONTH,1,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(MONTH,-1,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(YEAR,1,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(YEAR,-1,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(MONTH,2,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(MONTH,-2,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(YEAR,2,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(YEAR,-2,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(YEAR,10,dob_1) = dob_4 then new_id_1
		when sex_1 = sex_4 and dateadd(YEAR,-10,dob_1) = dob_4 then new_id_1
		end as new_id_4
into #duplicate_snz_ids41
from #duplicate_snz_ids2 a
where dob_4 is not null and dob_5 is null;

drop table if exists #duplicate_snz_ids42;
select 
snz_uid
,dob_1
,dob_2
,dob_3
,dob_4
,sex_1
,sex_2
,sex_3
,sex_4
,new_id_1
,new_id_2
,case when new_id_3 is not null then new_id_3
	    when dob_2 = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,-1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,-1,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(MONTH,-2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,-2,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,10,dob_2) = dob_3 then new_id_2
		when sex_2 = sex_3 and dateadd(YEAR,-10,dob_2) = dob_3 then new_id_2
		when new_id_2 < 0 and new_id_1 = snz_uid then new_id_2 - 10000000
		else -new_id_2 end as new_id_3
,case when new_id_4 is not null then new_id_4
		when dob_2 = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(MONTH,1,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(MONTH,-1,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(YEAR,1,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(YEAR,-1,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(MONTH,2,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(MONTH,-2,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(YEAR,2,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(YEAR,-2,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(YEAR,10,dob_2) = dob_4 then new_id_2
		when sex_2 = sex_4 and dateadd(YEAR,-10,dob_2) = dob_4 then new_id_2
		end as new_id_4
into #duplicate_snz_ids42
from #duplicate_snz_ids41 a;

drop table if exists #duplicate_snz_ids43;
select 
snz_uid
,dob_1
,dob_2
,dob_3
,dob_4
,sex_1
,sex_2
,sex_3
,sex_4
,new_id_1
,new_id_2,new_id_3
,case when new_id_4 is not null then new_id_4
	    when dob_3 = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(MONTH,1,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(MONTH,-1,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(YEAR,1,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(YEAR,-1,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(MONTH,2,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(MONTH,-2,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(YEAR,2,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(YEAR,-2,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(YEAR,10,dob_3) = dob_4 then new_id_3
		when sex_3 = sex_4 and dateadd(YEAR,-10,dob_3) = dob_4 then new_id_3
		when new_id_3 < 0 and new_id_1 = snz_uid then new_id_3 - 10000000
		else -new_id_3 end as new_id_4
into #duplicate_snz_ids43
from #duplicate_snz_ids42;

/* Combine them all together */
drop table if exists #duplicate_snz_ids3;
select snz_uid,dob_1,dob_2,null as dob_3,null as dob_4,null as dob_5,null as dob_6,sex_1,sex_2,null as sex_3,null as sex_4,null as sex_5,null as sex_6,new_id_1,new_id_2,null as new_id_3,null as new_id_4,null as new_id_5,null as new_id_6
into #duplicate_snz_ids3
from #duplicate_snz_ids22
union
select snz_uid,dob_1,dob_2,dob_3,null as dob_4,null as dob_5,null as dob_6,sex_1,sex_2,sex_3,null as sex_4,null as sex_5,null as sex_6,new_id_1,new_id_2,new_id_3,null as new_id_4,null as new_id_5,null as new_id_6
from #duplicate_snz_ids23
union
select snz_uid,dob_1,dob_2,dob_3,dob_4,null as dob_5,null as dob_6,sex_1,sex_2,sex_3,sex_4,null as sex_5,null as sex_6,new_id_1,new_id_2,new_id_3,new_id_4,null as new_id_5,null as new_id_6
from #duplicate_snz_ids43
union
select snz_uid,dob_1,dob_2,dob_3,dob_4,dob_5,dob_6,sex_1,sex_2,sex_3,sex_4,sex_5,sex_6,new_id_1,new_id_2,new_id_3,new_id_4,new_id_5,new_id_6
from #duplicate_snz_ids2
where dob_5 is not null;

/* Convert to long */
drop table if exists #duplicate_snz_ids4;
select snz_uid,dob_1 as dob, sex_1 as sex,new_id_1 as new_id,iif(snz_uid = new_id_1,1,0) as orig_id
into #duplicate_snz_ids4
from #duplicate_snz_ids3
union
select snz_uid,dob_2 as dob, sex_2 as sex,new_id_2 as new_id,iif(snz_uid = new_id_2,1,0) as orig_id
from #duplicate_snz_ids3
union
select snz_uid,dob_3 as dob, sex_3 as sex,new_id_3 as new_id,iif(snz_uid = new_id_3,1,0) as orig_id
from #duplicate_snz_ids3
union
select snz_uid,dob_4 as dob, sex_4 as sex,new_id_4 as new_id,iif(snz_uid = new_id_4,1,0) as orig_id
from #duplicate_snz_ids3
union
select snz_uid,dob_5 as dob, sex_5 as sex,new_id_5 as new_id,iif(snz_uid = new_id_5,1,0) as orig_id
from #duplicate_snz_ids3
union
select snz_uid,dob_6 as dob, sex_6 as sex,new_id_6 as new_id,iif(snz_uid = new_id_6,1,0) as orig_id
from #duplicate_snz_ids3;

drop table if exists #duplicate_snz_ids5;
select distinct 
*
,row_number() over (partition by snz_uid,dob,sex order by snz_uid,dob,sex,orig_id desc,new_id desc) as row_num
into #duplicate_snz_ids5
from #duplicate_snz_ids4
where new_id is not null;

/* Delete second row number where it's negative of first */
drop table if exists #duplicate_snz_ids6;
select 
a.*
into #duplicate_snz_ids6
from #duplicate_snz_ids5 a
left join #duplicate_snz_ids5 b on a.snz_uid = b.snz_uid and a.dob = b.dob and a.sex=b.sex and b.row_num = 1 and a.row_num = 2
where b.snz_uid is null or a.new_id > 0;

/* Now we want to give people new id, merge on row number so to not create extra people */
drop table if exists #hh5a;
select 
coalesce(b.new_id,a.snz_uid) as snz_uid 	
,a.household_id
,a.snapshot_date
,a.app_relship
,a.signatory_flag_ind
,a.sex
,a.dob
,a.raw_snz_uid
into #hh5a
from (select 
      *
	  ,row_number() over (partition by snz_uid,dob,sex order by snz_uid,dob,sex) as row_num 
	  from #hh5
	 ) a
left join #duplicate_snz_ids6 b on a.snz_uid = b.snz_uid and a.dob = b.dob and a.sex = b.sex and a.row_num = b.row_num;

/* Save table - duplicate snz_uid mean that we can not create a primary key */
drop table if exists #tenancy_hsehld_snap_clean_$(YYYYMM);
create table #tenancy_hsehld_snap_clean_$(YYYYMM) (
snz_uid int not null,
household_id int null,
link_set_key smallint not null,
snapshot_date date not null,
app_relship varchar(50) null,
signatory_flag_ind varchar(1) null,
sex varchar(1) null,
dob date null,
raw_snz_uid int null
);

set @link_set = (select top 1 link_set_key from [$(idicleanversion)].[data].[personal_detail]);
insert into #tenancy_hsehld_snap_clean_$(YYYYMM)

select
snz_uid,	 
household_id,
@link_set as link_set_key,
snapshot_date,
app_relship,
signatory_flag_ind,
sex,
dob,
raw_snz_uid
from #hh5a;	

/* De-duplicate if anyone consistent with the household table that is de-duped and is primary */
drop table if exists #hnz_dedup_ids;
select 
b.new_id as new_snz_uid
,a.snz_uid	
,a.household_id
,a.snapshot_date
,a.app_relship
,a.signatory_flag_ind
,a.sex
,a.dob
,a.raw_snz_uid
into #hnz_dedup_ids
from (select 
      *
	  ,row_number() over (partition by snz_uid,dob,sex order by snz_uid,dob,sex) as row_num 
	  from #hh5
	  ) a
inner join #duplicate_snz_ids6 b on a.snz_uid = b.snz_uid and a.dob = b.dob and a.sex = b.sex and a.row_num = b.row_num;

drop table if exists #temporary;
select 
coalesce(b.new_snz_uid,a.snz_uid) as snz_uid,
a.household_id,
a.house_id,
a.snapshot_date,
a.tenancy_start_date,
a.social_house_entry_date,
a.hshd_size,
a.hshd_type,
a.hshd_weekly_income,
a.inc_related_rent,
a.inc_reltd_rent_subsdy,
a.market_rent,
a.bedroom_required_count,
a.provider_name,
a.snz_idi_address_register_uid,
a.meshblock_code,
a.adj_ind,
a.exit_date,
a.exit_type
into #temporary
from #tenancy_snapshot_clean_$(YYYYMM) a
left join (select 
           snz_uid
		   ,household_id
		   ,snapshot_date
		   ,max(new_snz_uid) as new_snz_uid 
		   from #hnz_dedup_ids 
           where app_relship in ('PRIMARY','SIGNATORY','APPLICANT')
           group by snz_uid,household_id,snapshot_date
		  ) b on a.snz_uid = b.snz_uid and a.snapshot_date = b.snapshot_date and a.household_id = b.household_id and b.new_snz_uid > 0; 

/* Duplicate snz_uid mean that we can not create a primary key */
drop table if exists #tenancy_snpshot_clean_$(YYYYMM);
create table #tenancy_snpshot_clean_$(YYYYMM) (
snz_uid int null,
household_id int not null,
house_id int null,
link_set_key smallint not null,
snapshot_date date not null,
tenancy_start_date date null,
social_house_entry_date date null,
hshd_size smallint null,
hshd_type varchar(100) null,
hshd_weekly_income float null,
inc_related_rent float null,
inc_reltd_rent_subsdy float null,
market_rent float null,
bedroom_required_count int null,
provider_name varchar(5) null,
snz_idi_address_register_uid int null,
meshblock_code varchar(7) null,
adj_ind tinyint not null,
exit_date date null,
exit_type varchar(8) null,
constraint PK_ts_clean_$(YYYYMM) primary key clustered (link_set_key asc, household_id asc, snapshot_date asc)
);

set @link_set = (select top 1 link_set_key from [$(idicleanversion)].[data].[personal_detail])
insert into #tenancy_snpshot_clean_$(YYYYMM)

select 
snz_uid,	
household_id,
house_id,
@link_set as link_set_key,
snapshot_date,
tenancy_start_date,
social_house_entry_date,
hshd_size,
hshd_type,
hshd_weekly_income,
inc_related_rent,
inc_reltd_rent_subsdy,
market_rent,
bedroom_required_count,
provider_name,
snz_idi_address_register_uid,
meshblock_code,
adj_ind,
exit_date,
exit_type
from #temporary;

/* Create a clean houses_snapshot dataset and filling in gaps */
drop table if exists #houses_snapshot;
select distinct 
coalesce(b.house_id,c.house_id) as house_id
,coalesce(d.household_id,e.household_id) as household_id
,hnz_hs_snapshot_date as snapshot_date
,a.snz_household_uid
,a.snz_legacy_household_uid
,a.hnz_hs_bedrooms_nbr as bedrooms
,a.hnz_hs_market_rent_nbr as market_rent
,a.hnz_hs_rent_date as rent_date
,a.hnz_hs_lease_status_text as lease_status
,a.hnz_hs_lease_expiry_date as lease_expiry_date
,a.hnz_hs_occupancy_status_text as occupancy_status
,a.snz_idi_address_register_uid 
,a.hnz_hs_meshblock_id as hnz_meshblock
,a.hnz_hs_meshblock_code as snz_meshblock
into #houses_snapshot
from [$(idicleanversion)].[hnz_clean].[houses_snapshot] a
left join #tenancy_house_id_$(YYYYMM) b on a.snz_house_uid = b.snz_house_uid and b.snz_house_uid is not null
left join #tenancy_house_id_$(YYYYMM) c on a.snz_legacy_house_uid = c.snz_legacy_house_uid and c.snz_legacy_house_uid is not null
left join #tenancy_household_id_$(YYYYMM) d on a.snz_household_uid = d.snz_household_uid and d.snz_household_uid is not null
left join #tenancy_household_id_$(YYYYMM) e on a.snz_legacy_household_uid = e.snz_legacy_household_uid and e.snz_legacy_household_uid is not null;

/* There are probably lots of dups created due to the snz_household_uid field (i.e. different households in same month) */
drop table if exists #dup_houses;
select 
house_id
,snapshot_date
,count(*) as n
into #dup_houses
from #houses_snapshot
group by house_id,snapshot_date
having count(*) > 1
order by count(*) desc;

/* The duplicates have been caused by hnz giving a new house id to same hh and the old hh id remains empty, only take the version where we know someone is in the house */
drop table if exists #houses_snapshota1;
select 
a.*
into #houses_snapshota1
from #houses_snapshot a
left join #dup_houses b on a.house_id = b.house_id and a.snapshot_date = b.snapshot_date
where b.house_id is NULL or a.occupancy_status in ('PENDING-OCCUPIED','AVAILABLE-OCCUPIED');

drop table if exists #remaining_dups;
select distinct 
house_id
into #remaining_dups
from (select 
      house_id
	  ,snapshot_date
	  ,count(*) as n 
	  from #houses_snapshota1
	  group by house_id, snapshot_date
	  having count(*)>1
	 ) as a;

drop table if exists #houses_snapshota;
select 
a.*
into #houses_snapshota
from #houses_snapshota1 a
left join #remaining_dups b on a.house_id = b.house_id
where b.house_id is null;

/* There are some duplicate entries in the address table with different meshblock codes so take the most recent meshblock code */
drop table if exists #address_notification;
select 
a.snz_idi_address_register_uid
,a.ant_notification_date
,a.ant_post_code
,a.ant_meshblock_code
into #address_notification
from [$(idicleanversion)].[data].[address_notification] a
inner join (select distinct 
            snz_idi_address_register_uid 
			from #houses_snapshota
		    ) b on a.snz_idi_address_register_uid = b.snz_idi_address_register_uid;

drop table if exists #address_notification1;
select 
a.*
into #address_notification1
from (select 
      *
	  ,row_number() over (partition by snz_idi_address_register_uid order by ant_notification_date desc) as row_id 
	  from #address_notification
	 ) a
where row_id = 1;

/* Add on location variables that we need to tidy up meshblock entries */
drop table if exists #houses_snapshot1;
select 
a.*
,coalesce(c.ant_post_code,e.postcode,f.postcode) as postcode
,case when c.ant_post_code is not null then d.mb2013_code
	  when e.postcode is not null then b.mb2013_code
	  when f.postcode is not null then hnz_meshblock 
	  end as meshblock_2013
into #houses_snapshot1
from #houses_snapshota a
left join (select 
	       convert(real,meshblock_code) as meshblock_code
		   ,convert(real,mb2013_code) as mb2013_code
		   from [IDI_Metadata_$(YYYYMM)].[data].[meshblock_concordance]
		   ) b on convert(real,a.snz_meshblock) = b.meshblock_code
left join #address_notification1 c on a.snz_idi_address_register_uid = c.snz_idi_address_register_uid
left join (select 
           convert(real,meshblock_code) as meshblock_code
		   ,convert(real,mb2013_code) as mb2013_code 
	       from [IDI_Metadata_$(YYYYMM)].[data].[meshblock_concordance]
		   ) d on c.ant_meshblock_code = d.meshblock_code
left join (select 
           convert(real,mb2013_v1_00) as mb2013_v1_00
		   ,min(postcode) as postcode
	       from [IDI_Metadata].[clean_read_CLASSIFICATIONS].[hnz_meshblock_mb13postcode]
	       group by mb2013_v1_00
		   ) e on b.mb2013_code = e.mb2013_v1_00
left join (select 
           convert(real,mb2013_v1_00) as mb2013_v1_00
		   ,min(postcode) as postcode
	       from [IDI_Metadata].[clean_read_CLASSIFICATIONS].[hnz_meshblock_mb13postcode]
	       group by mb2013_v1_00
		   ) f on convert(real,a.hnz_meshblock) = f.mb2013_v1_00;

/* Look at mapping between hnz_meshblock and meshblock_2013 */
drop table if exists #mb_map;
select 
a.*
,row_number() over (partition by hnz_meshblock order by hnz_meshblock,count_mb desc) as row_id
into #mb_map
from (select 
      hnz_meshblock
	  ,meshblock_2013
	  ,min(postcode) as postcode
	  ,count(*) as count_mb
      from #houses_snapshot1 
      where hnz_meshblock is not null and meshblock_2013 is not null and convert(real,hnz_meshblock) > 0
      group by hnz_meshblock,meshblock_2013
	 ) a
order by hnz_meshblock;

/* For those with null mb then use the most common mapping */
drop table if exists #houses_snapshot2;
select 
a.house_id
,a.household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date
,a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,a.snz_idi_address_register_uid 
,a.hnz_meshblock
,a.snz_meshblock
,coalesce(a.postcode,b.postcode) as postcode
,coalesce(a.meshblock_2013,b.meshblock_2013) as meshblock_2013
into #houses_snapshot2
from #houses_snapshot1 a
left join (select 
           * 
		   from #mb_map 
		   where row_id = 1
		   ) b on a.hnz_meshblock = b.hnz_meshblock and b.hnz_meshblock is not null;

/* A number of houses have their address change, backfill */
drop table if exists #house_address;
select 
house_id
,max(case when meshblock_2013 is not null then snapshot_date else '1900-01-01' end) as max_mb_date
,max(case when snz_idi_address_register_uid is not null then snapshot_date  else '1900-01-01' end) as max_address_date
into #house_address
from #houses_snapshot2
group by house_id;

drop table if exists #houses_snapshot3;
select 
a.house_id
,a.household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date
,a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,d.snz_idi_address_register_uid 
,a.hnz_meshblock
,a.snz_meshblock
,c.meshblock_2013
,c.postcode
into #houses_snapshot3
from #houses_snapshot2 a
left join #house_address b on a.house_id = b.house_id
left join #houses_snapshot1 c on c.house_id = a.house_id and c.snapshot_date = b.max_mb_date and b.max_mb_date <> '1900-01-01'
left join #houses_snapshot1 d on d.house_id = a.house_id and d.snapshot_date = b.max_address_date and b.max_address_date <> '1900-01-01';

/* Truncate housing id for where we still don't know the meshblock */
drop table if exists #houses_snapshot4;
select 
a.house_id
,a.household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date
,a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,a.snz_idi_address_register_uid 
,a.hnz_meshblock
,a.snz_meshblock
,coalesce(a.postcode,b.postcode) as postcode
,coalesce(a.meshblock_2013,b.mb2013_v1_00) as meshblock_2013
into #houses_snapshot4
from #houses_snapshot3 a
left join (select 
           mb2013_v1_00_trunc
		   ,min(convert(real,mb2013_v1_00)) as mb2013_v1_00
		   ,min(postcode) as postcode
	       from (select 
		         convert(int,convert(real,mb2013_v1_00)/100) as mb2013_v1_00_trunc
				 ,* 
				 from [IDI_Metadata].[clean_read_CLASSIFICATIONS].[hnz_meshblock_mb13postcode]
				 ) c
	       group by mb2013_v1_00_trunc
		   ) b on convert(int,convert(real,a.hnz_meshblock)/100) = b.mb2013_v1_00_trunc;

/* Need to fill back again */
drop table if exists #house_address1;
select 
house_id
,max(case when meshblock_2013 is not null then snapshot_date else '1900-01-01' end) as max_mb_date
,max(case when snz_idi_address_register_uid is not null then snapshot_date else '1900-01-01' end) as max_address_date
into #house_address1
from #houses_snapshot4
group by house_id;

drop table if exists #houses_snapshot5;
select 
a.house_id
,a.household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date
,a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,d.snz_idi_address_register_uid 
,a.hnz_meshblock
,a.snz_meshblock
,c.meshblock_2013
,c.postcode
into #houses_snapshot5
from #houses_snapshot4 a
left join #house_address1 b on a.house_id = b.house_id
left join #houses_snapshot4 c on c.house_id = a.house_id and c.snapshot_date = b.max_mb_date and b.max_mb_date <> '1900-01-01'
left join #houses_snapshot4 d on d.house_id = a.house_id and d.snapshot_date = b.max_address_date and b.max_address_date <> '1900-01-01';

/* Fill in missing housing_id for Apr-Jun 2016 */
drop table if exists #houses_fill1a;
select 
a.house_id
,iif(b.household_id = c.household_id,b.household_id,NULL) as household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date
,a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,a.snz_idi_address_register_uid
,a.hnz_meshblock
,a.snz_meshblock
,a.meshblock_2013,
a.postcode
into #houses_fill1a
from #houses_snapshot5 a
left join #houses_snapshot5 b on a.house_id = b.house_id and b.snapshot_date = '2016-03-31'
left join #houses_snapshot5 c on a.house_id = c.house_id and c.snapshot_date = '2016-07-31'
where a.snapshot_date > '2016-03-31' and a.snapshot_date < '2016-07-31'; 
  
drop table if exists #houses_fill1aa;
select 
a.*
into #houses_fill1aa 
from #houses_fill1a a
where a.occupancy_status not in ('AVAILABLE-OCCUPIED','UNAVAILABLE-OCCUPIED','UNDER DISPOSAL-OCCUPIED','PENDING-','AVAILABLE-','UNAVAILABLE-OCCUPIED') or a.household_id is not null;

/* Now match on from tenancy dataset */
drop table if exists #houses_fill1b;
select 
a.house_id
,coalesce(b.household_id,a.household_id) as household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date,
a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,a.snz_idi_address_register_uid
,a.hnz_meshblock
,a.snz_meshblock
,a.meshblock_2013
,a.postcode 
into #houses_fill1b
from #houses_fill1a a
left join (select 
           snapshot_date
		   ,house_id
		   ,household_id
		   ,row_number() over (partition by snapshot_date,house_id order by snapshot_date,house_id,household_id) as row_num
		   from #tenancy_snpshot_clean_$(YYYYMM)
		   ) b on a.snapshot_date = b.snapshot_date and a.house_id = b.house_id and b.row_num = 1
where a.occupancy_status in ('AVAILABLE-OCCUPIED','UNAVAILABLE-OCCUPIED','UNDER DISPOSAL-OCCUPIED','PENDING-','AVAILABLE-','UNAVAILABLE-OCCUPIED') and a.household_id is null;

drop table if exists #houses_filla;
select * into #houses_filla from #houses_fill1aa
union
select * from #houses_fill1b;

/* Populate some of the households in 2000 which are all missing - where we can - include row id due to duplicates */
drop table if exists #missing_hh_2000;
select 
a.house_id
,b.household_id as household_id
,a.snapshot_date
,a.bedrooms
,a.market_rent
,a.rent_date
,a.lease_status
,a.lease_expiry_date
,a.occupancy_status
,a.snz_idi_address_register_uid 
,a.hnz_meshblock
,a.snz_meshblock
,meshblock_2013
,a.postcode
into #missing_hh_2000
from #houses_snapshot5 a
left join (select 
           house_id
		   ,household_id
		   ,snapshot_date
		   ,tenancy_start_date
	       ,row_number() over (partition by house_id order by house_id,tenancy_start_date) as row_id
           from #tenancy_snpshot_clean_$(YYYYMM) 
		   where  snapshot_date = '2001-01-31'
		   ) b on a.house_id = b.house_id and b.tenancy_start_date < a.snapshot_date and row_id = 1
where year(a.snapshot_date) = 2000;

/* Combine all data into one final table for houses */
drop table if exists #houses_snapshot_clean_$(YYYYMM);
create table #houses_snapshot_clean_$(YYYYMM) (
house_id int not null,
household_id int null,
snapshot_date date not null,
bedrooms int null,
market_rent float null,
rent_date date null,
lease_status varchar(20) null ,
lease_expiry_date date null,
occupancy_status varchar(100) null,
snz_idi_address_register_uid int null,
hnz_meshblock varchar(50) null,
snz_meshblock char(7) null,
meshblock_2013 int null,
postcode int null
constraint PK_hs_clean_$(YYYYMM) primary key clustered (house_id asc, snapshot_date asc)
);

insert into #houses_snapshot_clean_$(YYYYMM)

select 
* 
from #houses_snapshot5
where year(snapshot_date) <> 2000 and snapshot_date not in ('2016-04-30','2016-05-31','2016-06-30') 
union
/* Missing 2000 data */
select * from #missing_hh_2000 where not (snz_meshblock!=meshblock_2013)
union
/* April-June 2016 fix */
select * from #houses_filla where not (snz_meshblock!=meshblock_2013);

/* Create clean file of CHP houses as these are not captured in HNZ data, the code attempts to match any CHP houses to when they were HNZ houses and find their bedrooms that way */
/* Find all chp houses */
drop table if exists #chps_houses;
select 
house_id
,household_id
,snz_idi_address_register_uid
,min(snapshot_date) as min_snapshot_date
,max(snapshot_date) as max_snapshot_date
into #chps_houses
from #tenancy_snpshot_clean_$(YYYYMM) 
where provider_name = 'CHP'
group by house_id,household_id,snz_idi_address_register_uid;

/* The vast majority of them we know the address, fill in addresses where we can */
drop table if exists #chps1;
select 
a.house_id
,a.household_id
,a.min_snapshot_date
,a.max_snapshot_date
,max(isnull(b.snz_idi_address_register_uid,-1)) as snz_idi_address_register_uid
into #chps1
from #chps_houses a
left join #chps_houses b on a.house_id = b.house_id
group by a.house_id,a.household_id,a.min_snapshot_date,a.max_snapshot_date;

/* Try to find bedrooms */
drop table if exists #chps1a;
select 
a.house_id
,min(a.min_snapshot_date) as min_snapshot_date
into #chps1a
from #chps1 a
group by a.house_id;

drop table if exists #chps2;
select 
a.house_id
,b.household_id
,a.min_snapshot_date
,b.max_snapshot_date
,case when b.snz_idi_address_register_uid = -1 then null else b.snz_idi_address_register_uid end as snz_idi_address_register_uid
into #chps2
from #chps1a a
left join #chps1 b on a.house_id = b.house_id and a.min_snapshot_date = b.min_snapshot_date;

/* Try and find household the month (or two months) before */
drop table if exists #chps2a;
select 
a.*
,coalesce(c.house_id,d.house_id) as old_house_id
,coalesce(c.snz_idi_address_register_uid,d.snz_idi_address_register_uid) as old_snz_idi_address_register_uid
into #chps2a
from #chps2 a
left join #tenancy_snpshot_clean_$(YYYYMM) b on a.household_id = b.household_id and a.min_snapshot_date = b.snapshot_date
left join #tenancy_snpshot_clean_$(YYYYMM) c on b.snz_uid = c.snz_uid and a.min_snapshot_date = eomonth(c.snapshot_Date,1) and c.provider_name = 'HNZ'
left join #tenancy_snpshot_clean_$(YYYYMM) d on b.snz_uid = d.snz_uid and a.min_snapshot_date = eomonth(d.snapshot_Date,2) and d.provider_name = 'HNZ';

/* Only use that info for where address matches */
drop table if exists #chps3;
select 
house_id
,household_id
,min_snapshot_date
,max_snapshot_date
,snz_idi_address_register_uid
,iif(old_snz_idi_address_register_uid = snz_idi_address_register_uid,old_house_id,null) as old_house_id
into #chps3
from #chps2a;

/* Find most recent time the address appeared */
drop table if exists #chps3a;
select 
a.house_id
,a.old_house_id
,a.snz_idi_address_register_uid
,max(isnull(b.snapshot_date,'1900-01-01')) as most_recent_address_date
into #chps3a
from #chps3 a
left join #tenancy_snpshot_clean_$(YYYYMM) b on a.snz_idi_address_register_uid = b.snz_idi_address_register_uid and b.snapshot_date < a.min_snapshot_date
group by a.house_id,a.old_house_id,a.snz_idi_address_register_uid;

/* How many times address appears on that date */
drop table if exists #chps3b;
select 
a.house_id
,a.old_house_id
,a.snz_idi_address_register_uid
,case when a.most_recent_address_date = '1900-01-01' then null else a.most_recent_address_date  end as most_recent_address_date
,count(*) as count_houses_address
into #chps3b
from #chps3a a
left join #tenancy_snpshot_clean_$(YYYYMM) b on a.snz_idi_address_register_uid = b.snz_idi_address_register_uid and b.snapshot_date = a.most_recent_address_date
group by a.house_id,a.old_house_id,a.snz_idi_address_register_uid,a.most_recent_address_date;

/* For those with only one matching address use that as old house id */
drop table if exists #chps3c;
select 
a.house_id
,coalesce(a.old_house_id,b.house_id) as old_house_id
,a.snz_idi_address_register_uid
,a.most_recent_address_date
into #chps3c
from #chps3b a
left join #tenancy_snpshot_clean_$(YYYYMM) b on a.snz_idi_address_register_uid = b.snz_idi_address_register_uid and b.snapshot_date = a.most_recent_address_date and a.count_houses_address = 1;

/* Merge on bedrooms from houses dataset */
drop table if exists #chps4;
select 
a.house_id
,a.old_house_id
,a.snz_idi_address_register_uid
,a.most_recent_address_date
,b.bedrooms
,b.snz_idi_address_register_uid as oth_add
into #chps4
from #chps3c a
left join #houses_snapshot_clean_$(YYYYMM) b on a.old_house_id = b.house_id and b.snapshot_date = a.most_recent_address_date;

/* For some the old house id is also a CHP, so merge on these again */
drop table if exists #chps4a;
select distinct 
a.house_id
,a.old_house_id
,a.snz_idi_address_register_uid
,a.most_recent_address_date
,coalesce(a.bedrooms,b.bedrooms) as bedrooms
,coalesce(a.snz_idi_address_register_uid,b.snz_idi_address_register_uid) as oth_add
,iif(b.house_id is null,0,1) as old_CHP
into #chps4a
from #chps4 a
left join #chps4 b on a.old_house_id = b.house_id;

/* Merge with all CHP dates */
drop table if exists #chps5;
select 
a.house_id
,a.[household_id]
,a.[snapshot_date]
,b.[bedrooms]
,a.[market_rent]
,null as [rent_date]
,null as [lease_status]
,null as [lease_expiry_date]
,'AVAILABLE-OCCUPIED' as occupancy_status
,b.[snz_idi_address_register_uid]
,a.[meshblock_code] as hnz_meshblock
,a.[meshblock_code] as snz_meshblock
,a.[meshblock_code] as meshblock_2013
,null as [postcode]
into #chps5
from #tenancy_snpshot_clean_$(YYYYMM) a
left join #chps4a b on a.house_id = b.house_id
where a.provider_name = 'CHP';

/* Create dataset for CHPs */
drop table if exists #houses_snapshot_CHPs_$(YYYYMM);
create table #houses_snapshot_CHPs_$(YYYYMM) (
	house_id int not null,
	household_id int null,
	snapshot_date date not null,
	bedrooms int null,
	market_rent float null,
	rent_date date null,
	lease_status varchar(20) null ,
	lease_expiry_date date null,
	occupancy_status varchar(100) null,
	snz_idi_address_register_uid int null,
	hnz_meshblock varchar(50) null,
	snz_meshblock char(7) null,
	meshblock_2013 int null,
	postcode int null);

insert into #houses_snapshot_CHPs_$(YYYYMM)

select 
[house_id]
,[household_id]
,[snapshot_date]
,[bedrooms]
,[market_rent]
,null as [rent_date]
,null as [lease_status]
,null as [lease_expiry_date]
,occupancy_status
,[snz_idi_address_register_uid]
,hnz_meshblock
,snz_meshblock
,[meshblock_2013]
,[postcode]
from #chps5 a;

/* Create the Social housing tenancy spells output dataset */
drop table if exists [$(targetdb)].[$(targetschema)].[sh_tenancy_spells];

/* Get house detailed for HNZ and CHP houses */
with houses_info as (
select 
house_id
,household_id
,max(isnull(snz_idi_address_register_uid,-1)) as snz_idi_address_register_uid
,max(isnull(snz_meshblock,-1)) as meshblock_code
,max(isnull(bedrooms,-1)) as bedrooms
from #houses_snapshot_clean_$(YYYYMM)
group by house_id,household_id
union
select 
house_id
,household_id
,max(isnull(snz_idi_address_register_uid,-1)) as snz_idi_address_register_uid
,max(isnull(snz_meshblock,-1)) as meshblock_code
,max(isnull(bedrooms,-1)) as bedrooms
from #houses_snapshot_CHPs_$(YYYYMM)
group by house_id,household_id
)

,tenancies as (
select distinct
snz_uid
,a.house_id
,a.household_id
,b.snz_idi_address_register_uid
,'hnz_tenancy' as [source]
,spell_start =max(isnull(tenancy_start_date,'1900-01-01'))
,spell_end =max(isnull(exit_date,'1900-01-01'))
,bedrooms
,household_size =max(isnull(hshd_size,-1))
,exit_type
,b.meshblock_code
from #tenancy_snpshot_clean_$(YYYYMM) a
left join houses_info b on a.house_id = b.house_id and a.household_id = b.household_id
group by snz_uid
,a.house_id
,a.household_id
,b.snz_idi_address_register_uid
,bedrooms
,exit_type
,b.meshblock_code
)

/* Get integers for spells */
,integers as (
SELECT 
*
,spell_start_key =YEAR(spell_start)*100 + MONTH(spell_start)
,day_key =RIGHT(spell_start,2)
FROM tenancies
)

/* Get the next spell start for an individual */
,nextspell as (
SELECT 
*
,lead_nextspell =LEAD([spell_start_key],1,0) OVER(PARTITION BY [snz_uid] ORDER BY [snz_uid], [spell_start_key]) 
,lead_day =LEAD([day_key],1,0) OVER(PARTITION BY [snz_uid] ORDER BY [snz_uid], [spell_start_key]) 
FROM integers
)

/* Where there is a following spell, get the date for a month on either side of spell end */
,withinmonth as (
SELECT 
*
,lead_date =CASE WHEN lead_nextspell <> 0 THEN CAST(CONCAT(LEFT(lead_nextspell,4),'-',RIGHT(lead_nextspell,2),'-',[lead_day]) AS DATE) ELSE NULL END
,end_minus1m =CASE WHEN lead_nextspell <> 0 THEN DATEADD(MONTH,-1,spell_end) ELSE NULL END
,end_plus1m =CASE WHEN lead_nextspell <> 0 THEN DATEADD(MONTH,1,spell_end) ELSE NULL END
FROM nextspell
)

/* Identify spells that started within a month either side of preceding spell end */

select 
snz_uid
,house_id
,household_id
,case when snz_idi_address_register_uid = -1 then null else snz_idi_address_register_uid end as snz_idi_address_register_uid
,[source]
,case when spell_start = '1900-01-01' then null else spell_start end as spell_start
,case when spell_end = '1900-01-01' then null else spell_end end as spell_end
,adjacent_tenancy_spell = case when lead_date between end_minus1m and end_plus1m then 1 
                               when lead_nextspell = 0 then null else 0 end
,case when bedrooms = -1 then null else bedrooms end as bedrooms
,case when household_size = -1 then null else household_size end as household_size
,exit_type
,case when meshblock_code = -1 then null else meshblock_code end as meshblock_code
from withinmonth
order by snz_uid,spell_start