diff --git a/households.py b/households.py
index b2bb11e..0aed649 100644
--- a/households.py
+++ b/households.py
@@ -75,6 +75,19 @@ def parse_arguments():
         " Smaller numbers may result in out of memory errors. Larger numbers"
         " may increase runtime. Default is 4",
     )
+    parser.add_argument(
+        "--exact_addresses",
+        action="store_true",
+        help="Use exact matches on address as the definition of a household."
+        " By default the inference process splits addresses into street,"
+        " number, suffix, etc., and considers phone number and family name"
+        " when determining which records belong to which household."
+        " Enabling this feature causes the process to use the entire address"
+        " as a single string for comparisons, and only the address."
+        " If addresses have not been standardized/validated, this setting"
+        " will likely increase false negatives (records not being included"
+        " in households where they should be).",
+    )
     parser.add_argument(
         "--pairsfile",
         help="Location of matching pairs file",
@@ -179,7 +192,7 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
     # so it can be traversed sort of like a graph from any given patient
     # note the key is patient position within the pii_lines dataframe
     pos_to_pairs = get_household_matches(
-        pii_lines, args.split_factor, args.debug, args.pairsfile
+        pii_lines, args.split_factor, args.debug, args.exact_addresses, args.pairsfile
     )
 
     mapping_file = Path(args.mappingfile)
@@ -207,12 +220,13 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
             pii_lines["written_to_file"] = False
             hclk_position = 0
             lines_processed = 0
+            hh_sizes = []
             five_percent = int(len(pii_lines) / 20)
             # Match households
-            for position, line in pii_lines.sample(frac=1).iterrows():
+            for position, _line in pii_lines.sample(frac=1).iterrows():
                 # sample(frac=1) shuffles the entire dataframe
                 # note that "position" is the index and still relative to the original
-
+                line = pii_lines.loc[position]
                 lines_processed += 1
                 if args.debug and (lines_processed % five_percent) == 0:
@@ -223,7 +237,6 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
                 if line["written_to_file"]:
                     continue
-                line["written_to_file"] = True
 
                 if position in pos_to_pairs:
                     pat_positions = bfs_traverse_matches(pos_to_pairs, position)
                     # map those positions back to record_ids
                     pat_ids = list(
                         map(lambda p: pii_lines.at[p, "record_id"], pat_positions)
                     )
-                    # mark all these rows as written to file
-                    pii_lines.loc[pat_positions, ["written_to_file"]] = True
                 else:
                     pat_positions = [position]
                     pat_ids = [line[0]]
+                # mark all these rows as written to file
+                pii_lines.loc[pat_positions, ["written_to_file"]] = True
+
+                hh_sizes.append(len(pat_positions))
+
                 string_pat_positions = [str(p) for p in pat_positions]
                 pat_string = ",".join(string_pat_positions)
                 mapping_writer.writerow([hclk_position, pat_string])
@@ -258,6 +274,12 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
                     ]
                     hclk_position += 1
                     pii_writer.writerow(output_row)
+
+            hh_sizes_series = pd.Series(hh_sizes, dtype=int)
+
+            print("Household size stats:")
+            print(hh_sizes_series.describe())
+
     return n_households
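Note on the loop change above: the row is re-read with pii_lines.loc[position] because iterrows() yields copies, so "written_to_file" flags set on the dataframe mid-loop would never be visible through the yielded rows. A minimal sketch of the pitfall, with toy data (not from this repo):

    import pandas as pd

    df = pd.DataFrame({"written_to_file": [False, False]})
    for idx, row in df.iterrows():
        df["written_to_file"] = True  # mark every row on the first pass
        print(row["written_to_file"])  # stale copy: prints False on both passes
        print(df.loc[idx, "written_to_file"])  # re-read from the frame: prints True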
diff --git a/households/matching.py b/households/matching.py
index 6ba6429..93c8e19 100644
--- a/households/matching.py
+++ b/households/matching.py
@@ -13,10 +13,22 @@ from definitions import TIMESTAMP_FMT
 
 MATCH_THRESHOLD = 0.85
-FN_WEIGHT = 0.2
-PHONE_WEIGHT = 0.15
-ADDR_WEIGHT = 0.35
-ZIP_WEIGHT = 0.3
+FN_WEIGHT = 0.25
+PHONE_WEIGHT = 0.2
+ADDR_WEIGHT = 0.55
+# ZIP_WEIGHT = 0.25
+# zip is not used in weighting since all candidate pairs match on zip
+
+# a separate address threshold so that pairs with medium-low scores across all fields
+# don't wind up getting matched anyway
+ADDR_THRESHOLD = 0.95
+# using address_distance() below:
+# "205 GARDEN ST" v "206 GARDEN ST" --> 0.8333
+# "205 GARDEN ST" v "205 GAREDN ST" --> 0.98444
+# "205 GARDEN STREET" v "205 GAREDN ST" --> 0.9666
+# "205 GARDEN ST APT 5F" v "205 GARDEN ST APT 5J" --> 0.9472
+# so 0.95 should give us a good balance of not linking all apartments together
+# while still allowing some room for typos and variation
 
 
 def addr_parse(addr):
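A quick way to spot-check where a pair of addresses lands relative to ADDR_THRESHOLD, per the worked values in the comment above. This is a hypothetical harness: it assumes addr_parse() returns the component dict that address_distance() consumes and that the raw string must be attached under "household_street_address" (explode_address() does this in the real pipeline):

    from households.matching import ADDR_THRESHOLD, addr_parse, address_distance

    def parsed(raw):
        p = addr_parse(raw)
        p["household_street_address"] = raw  # attach raw string (assumption)
        return p

    d = address_distance(parsed("205 GARDEN ST"), parsed("206 GARDEN ST"))
    print(d, d > ADDR_THRESHOLD)  # ~0.8333, False: neighboring houses stay unlinked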
@@ -53,6 +65,23 @@ def address_distance(addr1, addr2):
     score = 0
     secondary_score = 0
 
+    a1 = addr1["household_street_address"]
+    a2 = addr2["household_street_address"]
+
+    if not a1 or not a2:
+        # if either is blank they get a score of 0
+        # this matches textdistance.jaro_winkler("", x)
+        # but textdistance.jaro_winkler("", "") is normally 1
+        # without this, 2 missing addresses could be a "perfect match"
+        # which is not what we want
+        return 0
+
+    if a1 == a2:
+        # if the strings are exactly identical,
+        # don't waste time with detailed comparisons
+        # this matches textdistance.jaro_winkler(x, x)
+        return 1
+
     # Change weights based on existence of second level address
     if (
         not addr1["prefix"]
@@ -213,15 +242,10 @@ def address_distance(addr1, addr2):
 
     # See if simple string compare of all things combined
     # with a 0.6 adjustment is better
-    a1 = addr1["household_street_address"]
-    a2 = addr2["household_street_address"]
-    if a1 and a2:
-        score = max(
-            score,
-            textdistance.jaro_winkler(a1, a2)
-            * (weight_number + weight_street_name)
-            * 0.6,
-        ) + (secondary_score * weight_secondary)
+    score = max(
+        score,
+        textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
+    ) + (secondary_score * weight_secondary)
 
     return score
 
@@ -271,7 +295,9 @@ def explode_address(row):
     return parsed
 
 
-def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None):
+def get_household_matches(
+    pii_lines, split_factor=4, debug=False, exact_addresses=False, pairsfile=None
+):
     if pairsfile:
         if debug:
             print(f"[{datetime.now()}] Loading matching pairs file")
@@ -283,28 +309,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
             print(f"[{datetime.now()}] Done loading matching pairs")
 
     else:
-        # break out the address into number, street, suffix, etc,
-        # so we can prefilter matches based on those
-        addr_cols = pii_lines.apply(
-            explode_address,
-            axis="columns",
-            result_type="expand",
-        )
-        pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
+
+        if exact_addresses:
+            pii_lines_exploded = pii_lines
+        else:
+            # break out the address into number, street, suffix, etc,
+            # so we can prefilter matches based on those
+            addr_cols = pii_lines.apply(
+                explode_address,
+                axis="columns",
+                result_type="expand",
+            )
+            pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
 
         if debug:
            print(f"[{datetime.now()}] Done pre-processing PII file")
 
-        candidate_links = get_candidate_links(pii_lines_exploded, split_factor, debug)
-        gc.collect()
-
-        matching_pairs = get_matching_pairs(
-            pii_lines_exploded, candidate_links, split_factor, debug
+        candidate_links = get_candidate_links(
+            pii_lines_exploded, split_factor, exact_addresses, debug
         )
-        del candidate_links
-        del pii_lines_exploded
         gc.collect()
 
+        if exact_addresses:
+            # the candidate links are already all the pairs with matching [address, zip]
+            matching_pairs = candidate_links
+        else:
+            matching_pairs = get_matching_pairs(
+                pii_lines_exploded,
+                candidate_links,
+                split_factor,
+                exact_addresses,
+                debug,
+            )
+            del pii_lines_exploded
+            del candidate_links
+            gc.collect()
+
         if debug:
             timestamp = datetime.now().strftime(TIMESTAMP_FMT)
             pairs_path = Path("temp-data") / f"households_pairs-{timestamp}.csv"
@@ -347,21 +387,25 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
     return pos_to_pairs
 
 
-def get_candidate_links(pii_lines, split_factor=4, debug=False):
+def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=False):
     # indexing step defines the pairs of records for comparison
     # indexer.full() does a full n^2 comparison, but we can do better
     indexer = recordlinkage.Index()
-    # use two block indexes to reduce the number of candidates
+    # use block indexes to reduce the number of candidates
     # while still retaining enough candidates to identify real households.
     # a block only on zip could work, but seems to run into memory issues
     # note sortedneighborhood on zip probably doesn't make sense
     # (zip codes in a geographic area will be too similar)
     # but if data is dirty then blocks may discard typos
-    indexer.block(["household_zip", "street", "number"])
-    indexer.block(["household_zip", "family_name"])
+    if exact_addresses:
+        indexer.block(["household_zip", "household_street_address"])
+    else:
+        indexer.block(["household_zip", "street", "number"])
+        indexer.block(["household_zip", "family_name"])
 
-    candidate_links = None
+    # start with an empty index we can append to
+    candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])
 
     # break up the dataframe into subframes,
     # and iterate over every pair of subframes.
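For context, a minimal sketch of how the exact_addresses block index behaves, using toy data and the column names above: only rows whose [zip, address] values are byte-for-byte identical become candidate pairs.

    import pandas as pd
    import recordlinkage

    df = pd.DataFrame({
        "household_zip": ["02139", "02139", "02139"],
        "household_street_address": ["205 GARDEN ST", "205 GARDEN ST", "206 GARDEN ST"],
    })
    indexer = recordlinkage.Index()
    indexer.block(["household_zip", "household_street_address"])
    print(indexer.index(df))  # a single candidate pair, linking rows 0 and 1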
@@ -404,20 +448,26 @@ def get_candidate_links(pii_lines, split_factor=4, debug=False):
             pairs_subset = pairs_subset[pairs_subset[0] < pairs_subset[1]]
             pairs_subset = pd.MultiIndex.from_frame(pairs_subset)
 
-            if candidate_links is None:
-                candidate_links = pairs_subset
-            else:
-                candidate_links = candidate_links.append(pairs_subset)
+            candidate_links = candidate_links.append(pairs_subset)
 
             gc.collect()
 
+    # rows with blank address match ("" == "") so drop those here
+    # TODO: ideally we wouldn't compare blank address lines in the first place
+    # but the indexing and splitting bits get complicated if we drop them earlier
+    blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
+    candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
+    candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
+
     if debug:
         print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")
 
     return candidate_links
 
 
-def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
+def get_matching_pairs(
+    pii_lines, candidate_links, split_factor, exact_addresses, debug
+):
     # Comparison step performs the defined comparison algorithms
     # against the candidate pairs
     compare_cl = recordlinkage.Compare()
@@ -428,16 +478,26 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
     compare_cl.string(
         "phone_number", "phone_number", method="jarowinkler", label="phone_number"
     )
-    compare_cl.add(
-        AddressComparison(
-            "exploded_address",
-            "exploded_address",
+    if exact_addresses:
+        compare_cl.string(
+            "household_street_address",
+            "household_street_address",
+            method="jarowinkler",
             label="household_street_address",
         )
-    )
-    compare_cl.string(
-        "household_zip", "household_zip", method="levenshtein", label="household_zip"
-    )
+    else:
+        compare_cl.add(
+            AddressComparison(
+                "exploded_address",
+                "exploded_address",
+                label="household_street_address",
+            )
+        )
+
+    # NOTE: zip code is DISABLED because our indexes block on zip code
+    # compare_cl.string(
+    #     "household_zip", "household_zip", method="levenshtein", label="household_zip"
+    # )
     # note: hamming distance is not implemented in this library,
     # but levenshtein is. the two metrics are likely similar enough
     # that it's not worth implementing hamming again
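A toy illustration of what the Compare step defined above produces for one candidate pair (made-up values; each label becomes a column of jaro_winkler similarities in [0, 1]):

    import pandas as pd
    import recordlinkage

    df = pd.DataFrame({
        "family_name": ["SMITH", "SMYTH"],
        "phone_number": ["5551234", "5551234"],
        "household_street_address": ["205 GARDEN ST", "205 GAREDN ST"],
    })
    pairs = pd.MultiIndex.from_tuples([(0, 1)])
    compare_cl = recordlinkage.Compare()
    compare_cl.string("family_name", "family_name", method="jarowinkler", label="family_name")
    compare_cl.string("phone_number", "phone_number", method="jarowinkler", label="phone_number")
    compare_cl.string(
        "household_street_address",
        "household_street_address",
        method="jarowinkler",
        label="household_street_address",
    )
    features = compare_cl.compute(pairs, df)
    print(features)  # one row of per-field similarity scores for the pair (0, 1)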
@@ -445,7 +505,8 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
     if debug:
         print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")
 
-    matching_pairs = None
+    # start with an empty index we can append to
+    matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
     # we know that we could support len(subset_A) in memory above,
     # so use the same amount here
     len_subset_A = int(len(pii_lines) / split_factor)
@@ -470,18 +531,18 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
 
         features = compare_cl.compute(subset_links, relevant_pii_lines)
 
+        # first filter by address similarity
+        features = features[features["household_street_address"] > ADDR_THRESHOLD]
+
         features["family_name"] *= FN_WEIGHT
         features["phone_number"] *= PHONE_WEIGHT
         features["household_street_address"] *= ADDR_WEIGHT
-        features["household_zip"] *= ZIP_WEIGHT
+        # features["household_zip"] *= ZIP_WEIGHT
 
         # filter the matches down based on the cumulative score
         matches = features[features.sum(axis=1) > MATCH_THRESHOLD]
 
-        if matching_pairs is None:
-            matching_pairs = matches.index
-        else:
-            matching_pairs = matching_pairs.append(matches.index)
+        matching_pairs = matching_pairs.append(matches.index)
 
         # matching pairs are bi-directional and not duplicated,
         # ex if (1,9) is in the list then (9,1) won't be
@@ -492,9 +553,6 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
         del matches
         gc.collect()
 
-    # drop exploded address because it's not used past this point
-    pii_lines.drop(columns=["exploded_address"], inplace=True)
-
     if debug:
         print(f"[{datetime.now()}] Found {len(matching_pairs)} matching pairs")
 
diff --git a/linkid_to_patid.py b/linkid_to_patid.py
index 36cd21f..38a86d2 100644
--- a/linkid_to_patid.py
+++ b/linkid_to_patid.py
@@ -61,8 +61,9 @@ def parse_source_file(source_file):
 
 def write_patid_links(args):
     links_archive = Path(args.linkszip)
     pii_lines = parse_source_file(args.sourcefile)
+    filepath = os.path.join(args.outputdir, "linkid_to_patid.csv")
     with open(
-        os.path.join(args.outputdir, "linkid_to_patid.csv"),
+        filepath,
         "w",
         newline="",
         encoding="utf-8",
@@ -87,13 +88,15 @@ def write_patid_links(args):
             # The +1 accounts for the header row in spreadsheet index
             patid = pii_lines[int(row[1]) + 1][0]
             writer.writerow([link_id, patid])
+    print(f"Wrote {filepath}")
 
 
 def write_hh_links(args):
     hh_links_file = Path(args.hhlinkszip)
     hh_pii_lines = parse_source_file(args.hhsourcefile)
+    filepath = os.path.join(args.outputdir, "householdid_to_patid.csv")
     with open(
-        os.path.join(args.outputdir, "householdid_to_patid.csv"),
+        filepath,
         "w",
         newline="",
         encoding="utf-8",
@@ -126,6 +129,7 @@ def write_hh_links(args):
 
         for record_id in record_ids_list:
             writer.writerow([household_id, record_id])
+    print(f"Wrote {filepath}")
 
 
 def translate_linkids(args):
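Looping back to households/matching.py: the scoring rule that get_matching_pairs now applies, as a worked example with illustrative similarity values (not output from any real run):

    FN_WEIGHT, PHONE_WEIGHT, ADDR_WEIGHT = 0.25, 0.2, 0.55
    MATCH_THRESHOLD, ADDR_THRESHOLD = 0.85, 0.95

    fn, phone, addr = 0.9, 1.0, 0.96  # hypothetical per-field similarities
    passes_prefilter = addr > ADDR_THRESHOLD  # True: survives the address filter
    score = FN_WEIGHT * fn + PHONE_WEIGHT * phone + ADDR_WEIGHT * addr
    # 0.225 + 0.2 + 0.528 = 0.953 > MATCH_THRESHOLD, so the pair is linked
    print(passes_prefilter, round(score, 3), score > MATCH_THRESHOLD)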