Additional households fixes based on testing by dehall · Pull Request #58 · mitre/data-owner-tools · GitHub
Additional households fixes based on testing #58

Merged
merged 7 commits into from Apr 26, 2023
34 changes: 28 additions & 6 deletions households.py
@@ -75,6 +75,19 @@ def parse_arguments():
" Smaller numbers may result in out of memory errors. Larger numbers"
" may increase runtime. Default is 4",
)
parser.add_argument(
"--exact_addresses",
action="store_true",
help="Use exact matches on address as the definition of a household."
" By default the inference process will split up addresses into"
" street, number, suffix, etc, and considers phone # and family name"
" when making a determination which records belong to which household."
" Enabling this feature causes the process to use the entire address"
" as a single string for comparisons, and only the address. "
" If addresses have not been standardized/validated, this setting"
" will likely increase false negatives (records not being included "
" in households where they should be).",
)
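As a quick illustration (not part of the diff), the store_true action means the flag takes no value on the command line; args.exact_addresses is simply False unless the flag is present:

# Illustration only: behavior of an argparse store_true flag
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--exact_addresses", action="store_true")

assert parser.parse_args([]).exact_addresses is False
assert parser.parse_args(["--exact_addresses"]).exact_addresses is True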
parser.add_argument(
"--pairsfile",
help="Location of matching pairs file",
@@ -179,7 +192,7 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
# so it can be traversed sort of like a graph from any given patient
# note the key is patient position within the pii_lines dataframe
pos_to_pairs = get_household_matches(
pii_lines, args.split_factor, args.debug, args.pairsfile
pii_lines, args.split_factor, args.debug, args.exact_addresses, args.pairsfile
)
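bfs_traverse_matches (called further down) walks pos_to_pairs like a graph. The repo defines that function elsewhere, so the following is only a hedged sketch of that kind of traversal, assuming pos_to_pairs maps a row position to the list of matching (pos, pos) pairs that mention it:

# Hypothetical sketch, not the repo's implementation:
from collections import deque

def bfs_traverse_matches(pos_to_pairs, start):
    """Collect every row position reachable from `start` through matching pairs."""
    seen = {start}
    queue = deque([start])
    while queue:
        pos = queue.popleft()
        for a, b in pos_to_pairs.get(pos, []):
            for neighbor in (a, b):
                if neighbor not in seen:
                    seen.add(neighbor)
                    queue.append(neighbor)
    return sorted(seen)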

mapping_file = Path(args.mappingfile)
@@ -207,12 +220,13 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
pii_lines["written_to_file"] = False
hclk_position = 0
lines_processed = 0
hh_sizes = []
five_percent = int(len(pii_lines) / 20)
# Match households
for position, line in pii_lines.sample(frac=1).iterrows():
for position, _line in pii_lines.sample(frac=1).iterrows():
# sample(frac=1) shuffles the entire dataframe
# note that "position" is the index and still relative to the original

line = pii_lines.loc[position]
lines_processed += 1

if args.debug and (lines_processed % five_percent) == 0:
@@ -223,20 +237,22 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar

if line["written_to_file"]:
continue
line["written_to_file"] = True

if position in pos_to_pairs:
pat_positions = bfs_traverse_matches(pos_to_pairs, position)
# map those row numbers to PATIDs
pat_ids = list(
map(lambda p: pii_lines.at[p, "record_id"], pat_positions)
)
# mark all these rows as written to file
pii_lines.loc[pat_positions, ["written_to_file"]] = True
else:
pat_positions = [position]
pat_ids = [line[0]]

# mark all these rows as written to file
pii_lines.loc[pat_positions, ["written_to_file"]] = True

hh_sizes.append(len(pat_positions))

string_pat_positions = [str(p) for p in pat_positions]
pat_string = ",".join(string_pat_positions)
mapping_writer.writerow([hclk_position, pat_string])
Expand All @@ -258,6 +274,12 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
]
hclk_position += 1
pii_writer.writerow(output_row)

hh_sizes_series = pd.Series(hh_sizes, dtype=int)

print("Household size stats:")
print(hh_sizes_series.describe())

return n_households


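For context on the household-size summary added above, pandas.Series.describe() prints count, mean, std, min, the quartiles, and max. A tiny illustration with invented sizes:

# Illustration only, with invented household sizes:
import pandas as pd

hh_sizes_series = pd.Series([1, 1, 2, 4], dtype=int)
print(hh_sizes_series.describe())
# count, mean, std, min, 25%, 50%, 75%, max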
170 changes: 114 additions & 56 deletions households/matching.py
@@ -13,10 +13,22 @@
from definitions import TIMESTAMP_FMT

MATCH_THRESHOLD = 0.85
FN_WEIGHT = 0.2
PHONE_WEIGHT = 0.15
ADDR_WEIGHT = 0.35
ZIP_WEIGHT = 0.3
FN_WEIGHT = 0.25
PHONE_WEIGHT = 0.2
ADDR_WEIGHT = 0.55
# ZIP_WEIGHT = 0.25
# zip is not used in weighting since all candidate pairs match on zip

# a separate address threshold so that pairs with medium-low scores across all fields
# don't wind up getting matched anyway
ADDR_THRESHOLD = 0.95
# using address_distance() below:
# "205 GARDEN ST" v "206 GARDEN ST" --> 0.8333
# "205 GARDEN ST" v "205 GAREDN ST" --> 0.98444
# "205 GARDEN STREET" v "205 GAREDN ST" --> 0.9666
# "205 GARDEN ST APT 5F" v "205 GARDEN ST APT 5J" --> 0.9472
# so 0.95 should give us a good balance of not linking all apartments together
# while still allowing some room for typos and variation
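To make the interplay of these constants concrete, here is a minimal sketch mirroring the filter applied later in get_matching_pairs (is_household_match is a hypothetical helper, not part of the PR): a pair must first clear the address threshold, then the weighted sum of field similarities must clear the overall match threshold.

# Hypothetical helper for illustration; similarities are Jaro-Winkler scores in [0, 1]
def is_household_match(family_name_sim, phone_sim, addr_sim):
    if addr_sim <= ADDR_THRESHOLD:
        return False  # medium-low address similarity is never enough on its own
    score = (
        family_name_sim * FN_WEIGHT
        + phone_sim * PHONE_WEIGHT
        + addr_sim * ADDR_WEIGHT
    )
    return score > MATCH_THRESHOLD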


def addr_parse(addr):
@@ -53,6 +65,23 @@ def address_distance(addr1, addr2):
score = 0
secondary_score = 0

a1 = addr1["household_street_address"]
a2 = addr2["household_street_address"]

if not a1 or not a2:
# if either is blank they get a score of 0
# this matches textdistance.jaro_winkler("", x)
# but textdistance.jaro_winkler("", "") is normally 1
# without this, 2 missing addresses could be a "perfect match"
# which is not what we want
return 0

if a1 == a2:
# if the strings are exactly identical,
# don't waste time with detailed comparisons
# this matches textdistance.jaro_winkler(x, x)
return 1
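The edge cases referenced in these comments can be checked directly against textdistance (illustration only):

import textdistance

textdistance.jaro_winkler("", "205 GARDEN ST")               # 0.0: one blank side scores zero
textdistance.jaro_winkler("", "")                            # 1.0: two blanks would "match" perfectly
textdistance.jaro_winkler("205 GARDEN ST", "205 GARDEN ST")  # 1.0: identical strings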

# Change weights based on existence of second level address
if (
not addr1["prefix"]
@@ -213,15 +242,10 @@ def address_distance(addr1, addr2):

# See if simple string compare of all things combined
# with a 0.6 adjustment is better
a1 = addr1["household_street_address"]
a2 = addr2["household_street_address"]
if a1 and a2:
score = max(
score,
textdistance.jaro_winkler(a1, a2)
* (weight_number + weight_street_name)
* 0.6,
) + (secondary_score * weight_secondary)
score = max(
score,
textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
) + (secondary_score * weight_secondary)
return score


@@ -271,7 +295,9 @@ def explode_address(row):
return parsed


def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None):
def get_household_matches(
pii_lines, split_factor=4, debug=False, exact_addresses=False, pairsfile=None
):
if pairsfile:
if debug:
print(f"[{datetime.now()}] Loading matching pairs file")
@@ -283,28 +309,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
print(f"[{datetime.now()}] Done loading matching pairs")

else:
# break out the address into number, street, suffix, etc,
# so we can prefilter matches based on those
addr_cols = pii_lines.apply(
explode_address,
axis="columns",
result_type="expand",
)
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")

if exact_addresses:
pii_lines_exploded = pii_lines
else:
# break out the address into number, street, suffix, etc,
# so we can prefilter matches based on those
addr_cols = pii_lines.apply(
explode_address,
axis="columns",
result_type="expand",
)
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")

if debug:
print(f"[{datetime.now()}] Done pre-processing PII file")

candidate_links = get_candidate_links(pii_lines_exploded, split_factor, debug)
gc.collect()

matching_pairs = get_matching_pairs(
pii_lines_exploded, candidate_links, split_factor, debug
candidate_links = get_candidate_links(
pii_lines_exploded, split_factor, exact_addresses, debug
)
del candidate_links
del pii_lines_exploded
gc.collect()

if exact_addresses:
# the candidate links are already all the pairs with matching [address, zip]
matching_pairs = candidate_links
else:
matching_pairs = get_matching_pairs(
pii_lines_exploded,
candidate_links,
split_factor,
exact_addresses,
debug,
)
del pii_lines_exploded
del candidate_links
gc.collect()

if debug:
timestamp = datetime.now().strftime(TIMESTAMP_FMT)
pairs_path = Path("temp-data") / f"households_pairs-{timestamp}.csv"
@@ -347,21 +387,25 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
return pos_to_pairs


def get_candidate_links(pii_lines, split_factor=4, debug=False):
def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=False):
# indexing step defines the pairs of records for comparison
# indexer.full() does a full n^2 comparison, but we can do better
indexer = recordlinkage.Index()
# use two block indexes to reduce the number of candidates
# use block indexes to reduce the number of candidates
# while still retaining enough candidates to identify real households.
# a block only on zip could work, but seems to run into memory issues
# note sortedneighborhood on zip probably doesn't make sense
# (zip codes in a geographic area will be too similar)
# but if data is dirty then blocks may discard typos

indexer.block(["household_zip", "street", "number"])
indexer.block(["household_zip", "family_name"])
if exact_addresses:
indexer.block(["household_zip", "household_street_address"])
else:
indexer.block(["household_zip", "street", "number"])
indexer.block(["household_zip", "family_name"])

candidate_links = None
# start with an empty index we can append to
candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])

# break up the dataframe into subframes,
# and iterate over every pair of subframes.
@@ -404,20 +448,26 @@ def get_candidate_links(pii_lines, split_factor=4, debug=False):
pairs_subset = pairs_subset[pairs_subset[0] < pairs_subset[1]]
pairs_subset = pd.MultiIndex.from_frame(pairs_subset)
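The `pairs_subset[0] < pairs_subset[1]` filter is what keeps candidate pairs one-directional; a quick illustration with invented pairs:

# Illustration only:
import pandas as pd

pairs = pd.DataFrame({0: [1, 9, 3], 1: [9, 1, 3]})
pairs = pairs[pairs[0] < pairs[1]]  # keeps (1, 9); drops the mirror (9, 1) and the self-pair (3, 3)
links = pd.MultiIndex.from_frame(pairs)
print(list(links))                  # [(1, 9)]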

if candidate_links is None:
candidate_links = pairs_subset
else:
candidate_links = candidate_links.append(pairs_subset)
candidate_links = candidate_links.append(pairs_subset)

gc.collect()

# rows with blank address match ("" == "") so drop those here
# TODO: ideally we wouldn't compare blank address lines in the first place
# but the indexing and splitting bits get complicated if we drop them earlier
blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")

if debug:
print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")

return candidate_links


def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
def get_matching_pairs(
pii_lines, candidate_links, split_factor, exact_addresses, debug
):
# Comparison step performs the defined comparison algorithms
# against the candidate pairs
compare_cl = recordlinkage.Compare()
@@ -428,24 +478,35 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
compare_cl.string(
"phone_number", "phone_number", method="jarowinkler", label="phone_number"
)
compare_cl.add(
AddressComparison(
"exploded_address",
"exploded_address",
if exact_addresses:
compare_cl.string(
"household_street_address",
"household_street_address",
method="jarowinkler",
label="household_street_address",
)
)
compare_cl.string(
"household_zip", "household_zip", method="levenshtein", label="household_zip"
)
else:
compare_cl.add(
AddressComparison(
"exploded_address",
"exploded_address",
label="household_street_address",
)
)

# NOTE: zip code is DISABLED because our indexes block on zip code
# compare_cl.string(
# "household_zip", "household_zip", method="levenshtein", label="household_zip"
# )
# note: hamming distance is not implemented in this library,
# but levenshtein is. the two metrics are likely similar enough
# that it's not worth implementing hamming again

if debug:
print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")

matching_pairs = None
# start with an empty index we can append to
matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
# we know that we could support len(subset_A) in memory above,
# so use the same amount here
len_subset_A = int(len(pii_lines) / split_factor)
@@ -470,18 +531,18 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):

features = compare_cl.compute(subset_links, relevant_pii_lines)

# first filter by address similarity
features = features[features["household_street_address"] > ADDR_THRESHOLD]

features["family_name"] *= FN_WEIGHT
features["phone_number"] *= PHONE_WEIGHT
features["household_street_address"] *= ADDR_WEIGHT
features["household_zip"] *= ZIP_WEIGHT
# features["household_zip"] *= ZIP_WEIGHT

# filter the matches down based on the cumulative score
matches = features[features.sum(axis=1) > MATCH_THRESHOLD]

if matching_pairs is None:
matching_pairs = matches.index
else:
matching_pairs = matching_pairs.append(matches.index)
matching_pairs = matching_pairs.append(matches.index)
# matching pairs are bi-directional and not duplicated,
# ex if (1,9) is in the list then (9,1) won't be

@@ -492,9 +553,6 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
del matches
gc.collect()

# drop exploded address because it's not used past this point
pii_lines.drop(columns=["exploded_address"], inplace=True)

if debug:
print(f"[{datetime.now()}] Found {len(matching_pairs)} matching pairs")
