python bisect 工具以及应用
主要用于在顺序固定的序列中查找以及插入
内置了四种方法
- bisect_left
- bisect_right
- insort_right
- insort_left
bisect_left/right 方法找到应该插入元素的位置,对于和序列中元素不相同的值,两个方法返回的一样,对于相同的值,left返回相同值的位置,right返回相同值下一个位置
例如在ps = [1, 3, 5, 9, 9, 100]
query = 9
的话,left返回3,right返回5
import bisect
import pandas as pd
import numpy as np
from functools import reduce
from typing import Listimport tensorflow as tf
import torch
from sklearn.preprocessing import KBinsDiscretizer# Bisection algorithms. 源码阅读
def insort_right(a, x, lo=0, hi=None, *, key=None):"""Insert item x in list a, and keep it sorted assuming a is sorted.If x is already in a, insert it to the right of the rightmost x.Optional args lo (default 0) and hi (default len(a)) bound theslice of a to be searched.A custom key function can be supplied to customize the sort order."""if key is None:lo = bisect_right(a, x, lo, hi)else:lo = bisect_right(a, key(x), lo, hi, key=key)a.insert(lo, x)def insort_left(a, x, lo=0, hi=None, *, key=None):"""Insert item x in list a, and keep it sorted assuming a is sorted.If x is already in a, insert it to the left of the leftmost x.Optional args lo (default 0) and hi (default len(a)) bound theslice of a to be searched.A custom key function can be supplied to customize the sort order."""if key is None:lo = bisect_left(a, x, lo, hi)else:lo = bisect_left(a, key(x), lo, hi, key=key)a.insert(lo, x)def bisect_right(a, x, lo=0, hi=None, *, key=None):"""Return the index where to insert item x in list a, assuming a is sorted.The return value i is such that all e in a[:i] have e <= x, and all e ina[i:] have e > x. So if x already appears in the list, a.insert(i, x) willinsert just after the rightmost x already there.Optional args lo (default 0) and hi (default len(a)) bound theslice of a to be searched.A custom key function can be supplied to customize the sort order."""if lo < 0:raise ValueError('lo must be non-negative')if hi is None:hi = len(a)# Note, the comparison uses "<" to match the# __lt__() logic in list.sort() and in heapq.if key is None:while lo < hi:mid = (lo + hi) // 2if x < a[mid]:hi = midelse:lo = mid + 1else:while lo < hi:mid = (lo + hi) // 2if x < key(a[mid]):hi = midelse:lo = mid + 1return lodef bisect_left(a, x, lo=0, hi=None, *, key=None):"""Return the index where to insert item x in list a, assuming a is sorted。The return value i is such that all e in a[:i] have e < x, and all e ina[i:] have e >= x. So if x already appears in the list, a.insert(i, x) willinsert just before the leftmost x already there.Optional args lo (default 0) and hi (default len(a)) bound theslice of a to be searched.A custom key function can be supplied to customize the sort order."""if lo < 0:raise ValueError('lo must be non-negative')if hi is None:hi = len(a)# Note, the comparison uses "<" to match the# __lt__() logic in list.sort() and in heapq.if key is None:while lo < hi:mid = (lo + hi) // 2if a[mid] < x:lo = mid + 1else:hi = midelse:while lo < hi:mid = (lo + hi) // 2if key(a[mid]) < x:lo = mid + 1else:hi = midreturn lo# Application
# 二分查找函数
ps = [1, 3, 5, 9, 100]
T = 66
print(bisect.bisect_left(ps, T, lo=0, hi=len(ps))) # 二分左边界
print(bisect.bisect_right(ps, T, lo=0, hi=len(ps))) # 二分右边界
print(bisect.bisect_left(ps, 9, lo=0, hi=len(ps))) # 二分左边界
print(bisect.bisect_right(ps, 9, lo=0, hi=len(ps))) # 二分右边界
bisect.insort_left(ps, T, lo=0, hi=len(ps)) # 二分插入到左侧
bisect.insort_right(ps, T, lo=0, hi=len(ps)) # 二分插入到右侧# 当时自我实现的,找到右边第一个比target的大的值的位置
def find_next_greater_o(arr, target):"""相当于bisect_right"""low, high = 0, len(arr) - 1try:if target < arr[low]:return -1except Exception as e:return -1while low <= high:mid = low + (high - low) // 2if arr[mid] <= target:low = mid + 1else:if arr[mid - 1] <= target:return midhigh = mid - 1return -1# 利用bisect实现
def find_next_greater(arr, target):try:i = bisect_right(arr, target, lo=0, hi=len(arr))if i == len(arr):return -1return iexcept Exception as e:print(f"Error [{e}] happened when finding the [{target}], return default")return -1def find_gt(a, x):# Find leftmost value greater thani = bisect_right(a, x)if i != len(a):return ireturn -1arr = [3, 4, 6, 7]
target = 5# print(find_next_greater(arr, target))
# print(find_gt(arr, target))
# print(find_next_greater(arr, 'fd'))
# print(find_next_greater(arr, 999))
# print(find_gt(arr, 999))# def grade(score, breakpoints=[60, 70, 80, 90], grades='FDCBA'):
# i = bisect(breakpoints, score)
# return grades[i]
#
# a = [grade(score) for score in [33, 99, 77, 70, 89, 90, 100]]
# ['F', 'A', 'C', 'C', 'B', 'A', 'A']
应用:数据分箱
# 数据分箱
def p_cut(bins: List[int], val: int, grades=None) -> int:"""return the result of discretization input val;左闭右开;:param bins: bins for describe:param val: value to set:param grades: customize the output"""if grades is not None:assert len(bins) == len(grades)grades = gradeselse:grades = reduce(lambda x, y: str(x) + str(y),[i for i in range(len(bins) + 1)])# try if for cover typeerrortry:i = bisect_right(bins, val)return int(grades[i])except TypeError as e:print(f"TypeError: {e} for input {val}")return -1print("*" * 10 + "Discretization test" + "*" * 10)
print(p_cut([1, 10, 20, 50, 100], val=5))
print(p_cut([1, 10, 20, 50, 100], val=1))
print(p_cut([1, 10, 20, 50, 100], val=10))
print(p_cut([1, 10, 20, 50, 100], val=500))
print(p_cut([1, 10, 20, 50, 100], val=""))bins = [1, 10, 20, 50, 100]
test_arr = [-100, None, 1, 5, 30, 500]
test_arr1 = [-100, 1, 5, 30, 500]
print([p_cut(bins, v) for v in test_arr])# 同样的功能利用pandas cut实现
# pandas cut 不支持val是其他类型,会直接报typeerror;
# pandas cut 对于不在区间内的值做NaN处理
print(pd.cut(test_arr1, bins=bins, labels=['a', 'b', 'c', 'd']))# 在大数据情况下,pandas cut可能不够高效,同样的功能还可以使用sklearn实现
data = np.array(test_arr1).reshape(-1, 1) # 重塑数据为2D,因为 Scikit-Learn 需要2D数组
est = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
est.fit(data)
bins_sk = est.bin_edges_[0] # 获取边界
print(bins_sk)# tensorflow实现
## tf.keras.layers.Discretization
## tf.keras.layers.Discretization 左闭右开
"""在tf1 中是使用 tf.feature_column.bucketized_column
但是在tf2 中官方推荐的是tf.keras.layers.Discretization
"""
discretization_layer = tf.keras.layers.Discretization(bin_boundaries=bins)
print(f"tf.keras output:{discretization_layer(test_arr1)}")## tf.searchsorted or torch.searchsorted(bins, values, right=True) 左开右闭
print(tf.reshape(tf.searchsorted(bins, test_arr1), (-1, len(test_arr1))))bins_tensor = torch.tensor(bins, dtype=torch.float32)
values = torch.tensor(test_arr1, dtype=torch.float32)
indices = torch.searchsorted(bins_tensor, values, right=False)# 对比
print("*" * 10)
print(f"self code output:{[p_cut(bins, v) for v in test_arr1]}")
print(f"pd.cut output: {pd.cut(test_arr1, bins=bins)}")
print(f"tf.keras output:{discretization_layer(test_arr1)}")
print(f"torch.searchsorted output:{indices}")
print(f"tf.searchsorted output :{tf.searchsorted(bins, test_arr1)}")
print("*" * 10 + "Discretization test finished" + "*" * 10)