blocks-transformer/pre_processing.py
Admin User 95034efa4e
All checks were successful
Build and Push Docker Image / test (push) Successful in 2m30s
Build and Push Docker Image / build_and_push (push) Successful in 4m50s
Series M v1 model
2025-06-13 18:01:40 +00:00

360 lines
20 KiB
Python

import math
# Per-variable treatment table, one row per configured variable:
#   (name, data_type, valid_min, valid_max, observed_cap_min, observed_cap_max)
# Row order is the iteration order of ``lookup_dict``.
_VARIABLE_SPECS = (
    ("balmag04", "int", 0.0, 600.0, 0.0, 427.0),
    ("utlmag01", "int", 0.0, 600.0, 0.0, 446.0),
    ("utlmag02", "int", 0.0, 600.0, 0.0, 486.0),
    ("utlmag03", "int", 0.0, 600.0, 0.0, 492.0),
    ("utlmag04", "int", 0.0, 600.0, 0.0, 414.0),
    ("mnpmag03", "int", 0.0, 600.0, 0.0, 494.0),
    ("duemag01", "int", 0.0, 600.0, 0.0, 488.0),
    ("trv01", "int", 1.0, 24.0, 1.0, 24.0),
    ("trv02", "int", 0.0, 12.0, 0.0, 12.0),
    ("index01", "int", 0.0, 999.0, 0.0, 999.0),
    ("index02", "int", 0.0, 999.0, 0.0, 999.0),
    ("rvlr75", "float", 0.0, 5.0, 0.0, 5.0),
    ("rvlr77", "float", 0.0, 5.0, 0.0, 5.0),
    ("rev12", "int", 0.0, 999.0, 0.0, 109.0),
    ("rev13", "int", 0.0, 999.0, 0.0, 82.0),
    ("rev14", "int", 0.0, 999.0, 0.0, 56.0),
    ("rev54", "int", 0.0, 999.0, 0.0, 9.0),
    ("rev84", "int", 0.0, 23.0, 0.0, 23.0),
    ("bkc14", "int", 0.0, 999.0, 0.0, 47.0),
    ("bkc84", "int", 0.0, 23.0, 0.0, 23.0),
    ("ret12", "int", 0.0, 999.0, 0.0, 42.0),
    ("evtg04", "int", 300.0, 850.0, 300.0, 845.0),
    ("rev201", "float", 0.0, 50.0, 0.0, 50.0),
    ("rev202", "float", 0.0, 50.0, 0.0, 50.0),
    ("rev203", "float", 0.0, 50.0, 0.0, 50.0),
    ("rev223", "float", 0.0, 50.0, 0.0, 49.0),
    ("rev224", "float", 0.0, 50.0, 0.0, 47.0),
    ("rev225", "float", 0.0, 50.0, 0.0, 45.0),
    ("walshr02", "int", 0.0, 999.0, 0.0, 2.0),
    ("rev231", "int", 0.0, 1e9, 0.0, 18101.0),
    ("rev232", "int", 0.0, 1e9, 0.0, 38963.0),
    ("rev233", "int", 0.0, 1e9, 0.0, 68709.0),
    ("rev252", "int", 0.0, 1e9, 0.0, 13988.0),
    ("rev253", "int", 0.0, 1e9, 0.0, 12347.0),
    ("all231", "int", 0.0, 1e9, 0.0, 38949.0),
    ("at28a", "int", 0.0, 1e9, 0.0, 1330166.0),
    ("at28b", "int", 0.0, 1e9, 0.0, 351273.0),
    ("at36s", "int", 0.0, 999.0, 0.0, 999.0),
    ("bc20s", "int", 0.0, 999.0, 0.0, 510.0),
    ("bc21s", "int", 0.0, 999.0, 0.0, 190.0),
    ("bc28s", "int", 0.0, 1e9, 0.0, 205768.0),
    ("bc36s", "int", 0.0, 999.0, 0.0, 999.0),
    ("bc97a", "int", 0.0, 1e9, 0.0, 184747.0),
    ("bc98a", "int", 0.0, 1e9, 0.0, 186395.0),
    ("bc102s", "int", 0.0, 1e9, 0.0, 31000.0),
    ("bc104s", "int", 0.0, 999.0, 0.0, 99.0),
    ("bc107s", "int", 0.0, 999.0, 0.0, 15.0),
    ("br20s", "int", 0.0, 999.0, 0.0, 495.0),
    ("fi21s", "int", 0.0, 999.0, 0.0, 162.0),
    ("fi34s", "int", 0.0, 999.0, 0.0, 100.0),
    ("fi35s", "int", 0.0, 1e9, 0.0, 70016.0),
    ("g051s", "int", 0.0, 999.0, 0.0, 50.0),
    ("g102s", "int", 0.0, 999.0, 0.0, 23.0),
    ("g105s", "int", 0.0, 999.0, 0.0, 23.0),
    ("g201a", "int", 0.0, 1e9, 0.0, 255142.0),
    ("g221d", "int", 0.0, 999.0, 0.0, 1.0),
    ("g232s", "int", 0.0, 999.0, 0.0, 21.0),
    ("g250a", "int", 0.0, 999.0, 0.0, 13.0),
    ("g960s", "int", 0.0, 999.0, 0.0, 17.0),
    ("g990s", "int", 0.0, 999.0, 0.0, 10.0),
    ("mt20s", "int", 0.0, 999.0, 0.0, 261.0),
    ("mt34s", "int", 0.0, 999.0, 0.0, 101.0),
    ("pb28s", "int", 0.0, 1e9, 0.0, 192100.0),
    ("pb34s", "int", 0.0, 999.0, 0.0, 100.0),
    ("re28s", "int", 0.0, 1e9, 0.0, 311518.0),
    ("re36s", "int", 0.0, 999.0, 0.0, 999.0),
    ("re102s", "int", 0.0, 1e9, 0.0, 41750.0),
    ("s004s", "int", 0.0, 999.0, 0.0, 179.0),
    ("s114s", "int", 0.0, 999.0, 0.0, 5.0),
    ("st32s", "int", 0.0, 1e9, 0.0, 221686.0),
    ("g106s", "int", 0.0, 999.0, 0.0, 590.0),
    ("g242b", "int", 0.0, 999.0, 0.0, 3.0),
    ("us21s", "int", 0.0, 999.0, 0.0, 153.0),
    ("us34s", "int", 0.0, 999.0, 0.0, 101.0),
    ("g403s", "int", 0.0, 999.0, 0.0, 5.0),
    ("g405s", "int", 0.0, 999.0, 0.0, 3.0),
    ("g408s", "int", 0.0, 999.0, 0.0, 3.0),
    ("g411s", "int", 0.0, 999.0, 0.0, 8.0),
    ("g416s", "int", 0.0, 999.0, 0.0, 23.0),
    ("g417s", "int", 0.0, 999.0, 0.0, 23.0),
    ("agg901", "int", 0.0, 999.0, 0.0, 3.0),
    ("agg902", "int", 0.0, 999.0, 0.0, 3.0),
    ("agg908", "int", 0.0, 1e9, 0.0, 69376.0),
    ("agg911", "float", 0.0, 10000.0, 0.0, 107.0),
    ("rle904", "int", 0.0, 1e9, 0.0, 309634.0),
    ("p02d", "int", 0.0, 99.0, 0.0, 3.0),
    ("p02h", "int", 0.0, 99.0, 0.0, 7.0),
    ("balmag01", "int", 0.0, 600.0, 0.0, 474.0),
    ("balmag02", "int", 0.0, 600.0, 0.0, 516.0),
    ("cv13", "int", 0.0, 100.0, 0.0, 50.0),
    ("cv17", "int", 0.0, 999.0, 0.0, 9.0),
    ("cv21", "int", 0.0, 1e9, 0.0, 171153.0),
    ("cv25", "int", 0.0, 100.0, 0.0, 33.0),
    ("ct319", "int", 0.0, 999.0, 0.0, 98.0),
    ("ct320", "int", 0.0, 999.0, 0.0, 52.0),
    ("cta11", "int", 0.0, 999.0, 0.0, 71.0),
    ("cta20", "int", 0.0, 999.0, 0.0, 45.0),
    ("cta21", "int", 0.0, 999.0, 0.0, 148.0),
    ("paymnt08", "float", 0.0, 50.0, 0.0, 50.0),
    ("rev321", "int", 0.0, 1e9, 0.0, 234408.0),
    ("rev322", "int", 0.0, 1e9, 0.0, 219698.0),
    ("bkc321", "int", 0.0, 1e9, 0.0, 171024.0),
    ("bkc322", "int", 0.0, 1e9, 0.0, 162590.0),
    ("bkc323", "int", 0.0, 1e9, 0.0, 152068.0),
    ("bkc324", "int", 0.0, 1e9, 0.0, 136763.0),
    ("bkc327", "int", 0.0, 1e9, 0.0, 171143.0),
    ("bkc328", "int", 0.0, 1e9, 0.0, 164352.0),
    ("eads66", "float", 300.0, 850.0, 300.0, 820.0),
)

# Expand each row into the per-variable configuration mapping. Every variable
# uses the "unk" default treatment, so it is filled in here rather than
# repeated on each row.
lookup_dict = {
    name: {
        "data_type": data_type,
        "valid_min": valid_min,
        "valid_max": valid_max,
        "default_treatment_type": "unk",
        "observed_cap_min_value": cap_min,
        "observed_cap_max_value": cap_max,
    }
    for name, data_type, valid_min, valid_max, cap_min, cap_max in _VARIABLE_SPECS
}
# 1. Pre-processing: type casting and variable treatment
def pre_processing(input_dict, lookup=None):
    """Cast and range-treat every configured variable found in ``input_dict``.

    For each variable in the configuration table the raw value is:

    1. Cast to the configured ``data_type`` (``"int"`` or ``"float"``). A
       missing key or a value that cannot be cast becomes ``None``.
    2. Replaced by NaN when it lies outside ``[valid_min, valid_max]`` and
       the variable's ``default_treatment_type`` is ``"unk"``.
    3. Clamped to ``observed_cap_min_value`` / ``observed_cap_max_value``
       when it is valid but beyond the observed caps. The cap is returned
       exactly as stored in the configuration.

    Args:
        input_dict: Mapping of variable name to raw value. Keys that are not
            in the configuration table are ignored.
        lookup: Optional configuration table with the same layout as the
            module-level ``lookup_dict``. Defaults to ``lookup_dict``.

    Returns:
        A new dict with one entry per configured variable, in configuration
        order, holding the treated value, ``None`` or NaN.
    """
    # Resolve the default at call time so the module-level table is only
    # needed when the caller does not supply one.
    config = lookup_dict if lookup is None else lookup
    casters = {"int": int, "float": float}

    processed = {}
    for var, cfg in config.items():
        val = input_dict.get(var)

        # Cast to required type; other data types are passed through as-is.
        caster = casters.get(cfg.get("data_type"))
        if caster is not None:
            try:
                val = caster(val)
            except (ValueError, TypeError, OverflowError):
                # OverflowError is raised by int(float("inf")); without it a
                # single infinite input would abort the whole call.
                val = None

        # Variable treatment
        vmin = cfg.get("valid_min")
        vmax = cfg.get("valid_max")
        if val is not None and not (math.isnan(vmin) or math.isnan(vmax)):
            out_of_range = val < vmin or val > vmax
            if cfg.get("default_treatment_type") == "unk" and out_of_range:
                val = float("nan")
            if not math.isnan(val):
                cmin = cfg.get("observed_cap_min_value")
                cmax = cfg.get("observed_cap_max_value")
                # Only valid values are clamped; the bounds checks keep
                # out-of-range values from being pulled back to a cap.
                if cmin is not None and vmin <= val < cmin:
                    val = cmin
                if cmax is not None and cmax < val <= vmax:
                    val = cmax
        processed[var] = val
    return processed