前几天写了一篇关于哈希算法的文章,起源就是在构思AB实验平台的时候,用到了哈希,所以对其做了深入的了解
AB实验平台是一般互联网做策略、样式实验会用到的一个系统,一般开启某个实验之后,需要对线上流量进行分流:客户端->实验平台->策略平台->应用服务,大概是这个链路
场景
线上流量策略分流,需要对不同人群做策略实验,最终效果好的一组需要推全到100%,在未推全之前,100%的流量会被分到原有流量+几个实验组;
比如我有一个实验:样式迭代,产品希望60%的流量走线上逻辑,剩下的40%,均分到4个实验组,这样可以细致的比对样式的效果,当然,如果一个用户命中了实验组A,那么他后续的行为,在实验组不调整的情况下,需要一直保持(有的人说希望调整了也保持,但是真的了解AB分流逻辑之后,真挺难的);
其实我们有自己的AB平台,但是我在想如果让我去实现,我会怎么做,便有了这一篇文章;
考虑的问题
1.如何保证一个用户对实验命中唯一性
在实验组不调整的情况下,PM希望用户分流的结果始终保持一致;
【策略】:hash算法,通过对imei进行哈希运算得到int的哈希值即可,可以加一个时间戳(实验修改时间),来控制用户的imei命中变化
2.如何按比例分流
【策略】取模运算,构建一个长度为100的bucket,然后通过对imei的哈希值运算结果对100取模,即可得到0-99的数,从而去命中数据区间,挑选实验组即可;由于哈希算法的特性,只要imei不变,时间戳(因子)不变,计算得到的哈希值是永远不会变的,所以取模结果,命中实验的结果也是不变的;
比如我的case是线上流量60%,其他实验组10%,10%,10%,10%,则可以构建一个数据区间:【0,60,70,80,90,100】
实战
代码
@Data
public class TrafficDistributor {private String id;// 实验idprivate String name;private double isolation;// 线上流量private List<Experiment> experiments;// 实验组private Table<String, String, Set<String>> whiteTable = HashBasedTable.create();private static Date updateTime = new Date(1719844946L);//2024/7/1 22:42:26public TrafficDistributor(String id, String name, double isolation, List<Experiment> experiments, Table<String, String, Set<String>> whiteTable) {this.id = id;this.name = name;this.isolation = isolation;this.experiments = experiments;this.whiteTable = whiteTable;}public static void main(String[] args) {// 实验idString id = "new_style";// 1.初始化白名单==》正式环境从db中拿出来初始化Table<String, String, Set<String>> white = HashBasedTable.create();white.put(id, "00", Sets.newHashSetWithExpectedSize(0));white.put(id, "01", Set.of("whitelist_imei_1", "whitelist_imei_2"));white.put(id, "02", Set.of("whitelist_imei_3", "whitelist_imei_4"));white.put(id, "11", Set.of("whitelist_imei_5", "whitelist_imei_6"));white.put(id, "12", Set.of("whitelist_imei_7", "whitelist_imei_8"));// 2.初始化实验组Experiment experiment1 = new Experiment("01", "对照组01", 10);Experiment experiment2 = new Experiment("02", "对照组02", 10);Experiment experiment3 = new Experiment("11", "实验组01", 10);Experiment experiment4 = new Experiment("12", "实验组02", 10);List<Experiment> experiments = List.of(experiment1, experiment2, experiment3, experiment4);// 3.初始化分流器TrafficDistributor distributor = new TrafficDistributor(id, "新样式实验", 60, experiments, white);// 4.模拟分流for (int times = 0; times < 100; times++) {int index = 1, size = 100;Map<String, Integer> result = Maps.newHashMap();List<Long> cost = Lists.newArrayListWithCapacity(size);while (index <= size) {String imei = RandomStringUtils.randomAlphanumeric(64);long start = System.currentTimeMillis();// 核心分流Experiment experiment = distributor.allocate(id, imei);if (experiment != null) {// 实验组result.compute(experiment.getId(), (eid, count) -> (count == null) ? 
1 : count + 1);} else {// 线上int value = result.getOrDefault("00", 0);result.put("00", value + 1);}cost.add(System.currentTimeMillis() - start);index++;}System.out.println("耗时:" + cost.stream().mapToLong(Long::longValue).sum() + ";结果:" + JSON.toJSONString(result));}}/*** 流量分配: 白名单 - 实验组 - 线上流量*/public Experiment allocate(String id, String imei) {Experiment whiteListShot = whiteListShot(id, imei);if (whiteListShot != null) {return whiteListShot;}List<Double> weights = experiments.stream().map(Experiment::getWeight).collect(Collectors.toList());int groupId = distributeTraffic(weights, imei);if (groupId < 0) {// 线上流量return null;}// 实验组流量return experiments.get(groupId);}/*** 根据每个实验组的权重配比,判断最终流量应该分配到哪个实验组。** @param weights 每个实验组的权重值数组。* @param imei imei* @return 分配流量的实验组索引*/public int distributeTraffic(List<Double> weights, String imei) {if (CollectionUtils.isEmpty(weights)) {return -1;}double totalWeight = 100;// 总权重100%double testWeight = weights.stream().mapToDouble(Double::doubleValue).sum();// 实验组总权重if (totalWeight != (isolation + testWeight)) {throw new IllegalArgumentException("实验组和对照组流量分配存在问题");}// imei+时间戳(因子)生成hashint hash = HashUtils.hashcode(imei + updateTime.getTime());return bucketShot(weights, hash % totalWeight);}/*** 哈希值进行取模定位后,命中的实验*/public int bucketShot(List<Double> weights, double bucketIndex) {List<Double> range = Lists.newArrayListWithCapacity(weights.size() + 1);range.add(isolation);double pre = range.get(0);for (double weight : weights) {range.add(pre + weight);pre = pre + weight;}for (int i = 0; i < range.size(); i++) {if (bucketIndex <= range.get(i)) {return i - 1;}}throw new IllegalArgumentException("实验组和对照组流量分配存在问题");}/*** 白名单命中** @param id 实验id* @param imei imei* @return 实验组*/public Experiment whiteListShot(String id, String imei) {assert whiteTable != null && !whiteTable.isEmpty();if (!whiteTable.containsRow(id)) {throw new IllegalArgumentException("实验id=" + id + "不存在");}Map<String, Set<String>> experimentData = whiteTable.row(id);for 
(Map.Entry<String, Set<String>> entry : experimentData.entrySet()) {Set<String> values = entry.getValue();if (!CollectionUtils.isEmpty(values) && values.contains(imei)) {String key = entry.getKey();return experiments.stream().filter(test -> test.getId().equals(key)).findFirst().orElse(null);}}return null;}// 其他方法保持不变@Data@AllArgsConstructorstatic class Experiment {private String id;private String name;private double weight;}
public class HashUtils {
    /**
     * Computes a non-negative 32-bit hash of {@code obj.toString()}.
     *
     * <p>The loop starts from the 32-bit FNV offset basis and, for every UTF-16 char, XORs it in
     * and multiplies by the 32-bit FNV prime; a few extra shift/xor rounds then mix the bits.
     *
     * @param obj value to hash; its {@code toString()} is what gets hashed
     * @return a hash in {@code [0, Integer.MAX_VALUE]}
     * @throws NullPointerException if {@code obj} is {@code null}
     */
    public static int hashcode(Object obj) {
        final int p = 16777619;
        int hash = (int) 2166136261L;
        String str = obj.toString();
        for (int i = 0; i < str.length(); i++) {
            hash = (hash ^ str.charAt(i)) * p;
        }
        hash += hash << 13;
        hash ^= hash >> 7;
        hash += hash << 3;
        hash ^= hash >> 17;
        hash += hash << 5;
        // Math.abs(Integer.MIN_VALUE) is still Integer.MIN_VALUE, so that single value has to be
        // mapped explicitly; every other result is identical to plain Math.abs.
        if (hash == Integer.MIN_VALUE) {
            return 0;
        }
        return Math.abs(hash);
    }
}
分析
耗时:34;结果:{"11":11,"00":61,"01":7,"12":9,"02":12}
耗时:4;结果:{"11":8,"00":61,"12":4,"01":9,"02":18}
耗时:2;结果:{"11":9,"00":60,"01":10,"12":7,"02":14}
耗时:2;结果:{"11":11,"00":54,"01":10,"12":14,"02":11}
耗时:1;结果:{"11":9,"00":65,"12":12,"01":8,"02":6}
耗时:3;结果:{"11":9,"00":61,"01":6,"12":12,"02":12}
耗时:2;结果:{"11":9,"00":56,"12":8,"01":17,"02":10}
耗时:2;结果:{"11":7,"00":61,"12":7,"01":10,"02":15}
耗时:1;结果:{"11":8,"00":58,"01":13,"12":4,"02":17}
耗时:0;结果:{"11":5,"00":72,"01":9,"12":5,"02":9}
耗时:0;结果:{"11":2,"00":70,"12":14,"01":8,"02":6}
耗时:2;结果:{"11":7,"00":63,"12":10,"01":10,"02":10}
耗时:0;结果:{"11":8,"00":60,"12":11,"01":10,"02":11}
耗时:1;结果:{"11":9,"00":60,"12":11,"01":13,"02":7}
耗时:0;结果:{"11":12,"00":71,"12":5,"01":9,"02":3}
耗时:1;结果:{"11":12,"00":57,"01":11,"12":11,"02":9}
耗时:0;结果:{"11":8,"00":62,"01":14,"12":7,"02":9}
耗时:1;结果:{"11":10,"00":64,"12":6,"01":9,"02":11}
耗时:2;结果:{"11":5,"00":73,"12":7,"01":9,"02":6}
耗时:0;结果:{"11":9,"00":68,"01":6,"12":9,"02":8}
耗时:1;结果:{"11":12,"00":63,"01":10,"12":4,"02":11}
耗时:1;结果:{"11":15,"00":59,"01":8,"12":9,"02":9}
耗时:0;结果:{"11":10,"00":66,"01":8,"12":6,"02":10}
耗时:1;结果:{"11":8,"00":64,"01":10,"12":6,"02":12}
耗时:2;结果:{"11":11,"00":63,"01":8,"12":8,"02":10}
耗时:1;结果:{"11":5,"00":66,"12":9,"01":12,"02":8}
耗时:3;结果:{"11":8,"00":67,"12":10,"01":9,"02":6}
耗时:1;结果:{"11":9,"00":54,"12":16,"01":7,"02":14}
耗时:0;结果:{"11":11,"00":63,"12":5,"01":11,"02":10}
耗时:1;结果:{"11":10,"00":59,"01":12,"12":8,"02":11}
耗时:1;结果:{"11":12,"00":62,"12":11,"01":8,"02":7}
耗时:0;结果:{"11":8,"00":59,"12":8,"01":13,"02":12}
耗时:1;结果:{"11":12,"00":51,"12":11,"01":15,"02":11}
耗时:1;结果:{"11":1,"00":72,"01":10,"12":8,"02":9}
耗时:1;结果:{"11":8,"00":55,"01":18,"12":9,"02":10}
耗时:0;结果:{"11":6,"00":54,"01":22,"12":5,"02":13}
耗时:1;结果:{"11":9,"00":58,"12":11,"01":11,"02":11}
耗时:1;结果:{"11":15,"00":57,"12":12,"01":3,"02":13}
耗时:1;结果:{"11":6,"00":66,"12":10,"01":11,"02":7}
耗时:0;结果:{"11":13,"00":61,"12":12,"01":8,"02":6}
耗时:0;结果:{"11":8,"00":61,"12":11,"01":10,"02":10}
耗时:0;结果:{"11":10,"00":57,"01":10,"12":12,"02":11}
耗时:1;结果:{"11":6,"00":62,"12":12,"01":11,"02":9}
耗时:5;结果:{"11":9,"00":64,"12":8,"01":10,"02":9}
耗时:0;结果:{"11":15,"00":54,"12":8,"01":9,"02":14}
耗时:0;结果:{"11":12,"00":62,"01":8,"12":10,"02":8}
耗时:0;结果:{"11":9,"00":63,"12":6,"01":12,"02":10}
耗时:0;结果:{"11":9,"00":59,"01":10,"12":9,"02":13}
耗时:1;结果:{"11":10,"00":58,"01":7,"12":14,"02":11}
耗时:1;结果:{"11":9,"00":73,"01":8,"12":2,"02":8}
耗时:0;结果:{"11":14,"00":62,"12":7,"01":10,"02":7}
耗时:1;结果:{"11":12,"00":55,"12":10,"01":12,"02":11}
耗时:0;结果:{"11":5,"00":59,"01":7,"12":17,"02":12}
耗时:0;结果:{"11":10,"00":59,"12":7,"01":10,"02":14}
耗时:1;结果:{"11":11,"00":54,"01":17,"12":8,"02":10}
耗时:0;结果:{"11":8,"00":65,"12":8,"01":9,"02":10}
耗时:1;结果:{"11":13,"00":61,"12":8,"01":9,"02":9}
耗时:1;结果:{"11":6,"00":67,"01":10,"12":11,"02":6}
耗时:1;结果:{"11":7,"00":61,"12":8,"01":10,"02":14}
耗时:0;结果:{"11":6,"00":63,"12":8,"01":10,"02":13}
耗时:0;结果:{"11":9,"00":62,"12":9,"01":9,"02":11}
耗时:1;结果:{"11":5,"00":65,"01":8,"12":11,"02":11}
耗时:0;结果:{"11":11,"00":52,"12":9,"01":15,"02":13}
耗时:0;结果:{"11":14,"00":66,"01":4,"12":7,"02":9}
耗时:0;结果:{"11":12,"00":54,"12":8,"01":8,"02":18}
耗时:1;结果:{"11":9,"00":64,"12":8,"01":10,"02":9}
耗时:1;结果:{"11":9,"00":65,"01":3,"12":11,"02":12}
耗时:0;结果:{"11":5,"00":67,"01":7,"12":12,"02":9}
耗时:0;结果:{"11":13,"00":50,"01":12,"12":11,"02":14}
耗时:1;结果:{"11":18,"00":55,"12":7,"01":10,"02":10}
耗时:0;结果:{"11":5,"00":64,"01":12,"12":5,"02":14}
耗时:0;结果:{"11":10,"00":68,"12":6,"01":7,"02":9}
耗时:1;结果:{"11":9,"00":71,"12":4,"01":6,"02":10}
耗时:0;结果:{"11":8,"00":62,"12":9,"01":9,"02":12}
耗时:0;结果:{"11":10,"00":64,"12":8,"01":9,"02":9}
耗时:0;结果:{"11":9,"00":57,"12":10,"01":9,"02":15}
耗时:0;结果:{"11":10,"00":60,"12":13,"01":9,"02":8}
耗时:0;结果:{"11":12,"00":66,"01":5,"12":9,"02":8}
耗时:0;结果:{"11":6,"00":58,"01":11,"12":13,"02":12}
耗时:1;结果:{"11":10,"00":62,"01":12,"12":9,"02":7}
耗时:1;结果:{"11":7,"00":66,"12":11,"01":7,"02":9}
耗时:0;结果:{"11":10,"00":63,"12":9,"01":11,"02":7}
耗时:1;结果:{"11":8,"00":61,"12":10,"01":12,"02":9}
耗时:0;结果:{"11":8,"00":62,"12":6,"01":10,"02":14}
耗时:0;结果:{"11":7,"00":68,"12":8,"01":11,"02":6}
耗时:0;结果:{"11":11,"00":54,"01":11,"12":16,"02":8}
耗时:0;结果:{"11":7,"00":68,"01":10,"12":6,"02":9}
耗时:0;结果:{"11":7,"00":65,"12":7,"01":8,"02":13}
耗时:0;结果:{"11":8,"00":69,"01":8,"12":5,"02":10}
耗时:0;结果:{"11":15,"00":60,"01":6,"12":11,"02":8}
耗时:0;结果:{"11":9,"00":70,"01":6,"12":7,"02":8}
耗时:0;结果:{"11":14,"00":62,"12":7,"01":10,"02":7}
耗时:0;结果:{"11":11,"00":64,"12":7,"01":7,"02":11}
耗时:1;结果:{"11":6,"00":56,"12":14,"01":10,"02":14}
耗时:0;结果:{"11":7,"00":64,"12":8,"01":11,"02":10}
耗时:1;结果:{"11":14,"00":65,"12":4,"01":10,"02":7}
耗时:0;结果:{"11":13,"00":59,"12":7,"01":13,"02":8}
耗时:1;结果:{"11":5,"00":65,"12":8,"01":14,"02":8}
耗时:1;结果:{"11":12,"00":54,"01":11,"12":10,"02":13}
耗时:1;结果:{"11":10,"00":56,"12":11,"01":11,"02":12}
Process finished with exit code 0
- 我手写的hashCode方法,包括随机字符串生成的imei仍然存在实验组分类偏向的情况,中间尝试过使用一致性哈希解决偏移,但是又解决不了加权按比例分配的问题,不知道各位同仁有没有什么好的建议;我怀疑是随机字符串的偏向性问题,后面看看有没有办法解决。需要注意的是,每轮只采样了100个imei,对于占比10%的实验组,命中数本身的随机波动(标准差约为3)就很大,要判断哈希是否真的有偏,需要把样本量加大到万级以上再观察各组占比
- 取模算法是个宝藏,既可以精准定位,也可以利用随机性,来做区间命中
- updateTime它是个变化因子,如果实验有新增实验组或者流量调整,可以利用它来控制imei实验组变化
参考资料
- https://zhuanlan.zhihu.com/p/404232432
- https://blog.csdn.net/SmartCodeTech/article/details/113698568?spm=1001.2101.3001.6650.2&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ECtr-2-113698568-blog-131205090.235%5Ev43%5Epc_blog_bottom_relevance_base8&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ECtr-2-113698568-blog-131205090.235%5Ev43%5Epc_blog_bottom_relevance_base8&utm_relevant_index=5