敏感词过滤方案
敏感词过滤方案
常见算法:Trie树算法 和 DFA算法
Trie树(字典树、单词查找树)
应用场景:字符串匹配,解决一组字符串集合中快速找到某个字符串的问题。例如:浏览器的搜索关键词提示就可以基于Trie树做。
思路:通过公共前缀来提高字符串匹配效率
Trie树缺陷:空间换时间,占用的内存大。
改进:使用双数组Trie树 -DAT(改进版Trie树)。
DAT树优点:内存占用极低,可以达到Trie树内存的1%左右。
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;public class DoubleArrayTrie {private final static int BUF_SIZE = 16384; // 缓冲区大小private final static int UNIT_SIZE = 8; // 单位大小private static class Node {int code;int depth;int left;int right;};private int check[]; // 存储 节点的父子关系private int base[]; // 存储 转移状态 和 位置private boolean used[]; // 记录数据位置是否被占用private int size;private int allocSize;private List<String> key; // 存储插入的字符串列表private int keySize; // 字符串长度private int length[]; // 长度数组private int value[]; // 对应值的数组// 用于构建 Trie 的过程控制private int progress; private int nextCheckPos;// boolean no_delete_;int error_;/*** 用于扩展 base 和 check 数组的大小,以适应新增节点。*/ private int resize(int newSize) {int[] base2 = new int[newSize];int[] check2 = new int[newSize];boolean used2[] = new boolean[newSize];if (allocSize > 0) {System.arraycopy(base, 0, base2, 0, allocSize);System.arraycopy(check, 0, check2, 0, allocSize);System.arraycopy(used2, 0, used2, 0, allocSize);}base = base2;check = check2;used = used2;return allocSize = newSize;}/*** 从父节点 parent 获取兄弟节点 siblings,用于建立节点的连接关系。* 通过遍历 key,根据字符编码计算每个节点的转移关系。*/private int fetch(Node parent, List<Node> siblings) {if (error_ < 0)return 0;int prev = 0;for (int i = parent.left; i < parent.right; i++) {if ((length != null ? length[i] : key.get(i).length()) < parent.depth)continue;String tmp = key.get(i);int cur = 0;if ((length != null ? length[i] : tmp.length()) != parent.depth)cur = (int) tmp.charAt(parent.depth) + 1;if (prev > cur) {error_ = -3;return 0;}if (cur != prev || siblings.size() == 0) {Node tmp_node = new Node();tmp_node.depth = parent.depth + 1;tmp_node.code = cur;tmp_node.left = i;if (siblings.size() != 0)siblings.get(siblings.size() - 1).right = i;siblings.add(tmp_node);}prev = cur;}if (siblings.size() != 0)siblings.get(siblings.size() - 1).right = parent.right;return siblings.size();}/*** 递归插入兄弟节点 siblings 到双数组中。* 计算节点的 begin 位置,调整 nextCheckPos,并为每个兄弟节点设置 check 和 base 数组值。*/private int insert(List<Node> siblings) {if (error_ < 0)return 0;int begin = 0;int pos = ((siblings.get(0).code + 1 > nextCheckPos) ? siblings.get(0).code + 1: nextCheckPos) - 1;int nonzero_num = 0;int first = 0;if (allocSize <= pos)resize(pos + 1);outer: while (true) {pos++;if (allocSize <= pos)resize(pos + 1);if (check[pos] != 0) {nonzero_num++;continue;} else if (first == 0) {nextCheckPos = pos;first = 1;}begin = pos - siblings.get(0).code;if (allocSize <= (begin + siblings.get(siblings.size() - 1).code)) {// progress can be zerodouble l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0* keySize / (progress + 1);resize((int) (allocSize * l));}if (used[begin])continue;for (int i = 1; i < siblings.size(); i++)if (check[begin + siblings.get(i).code] != 0)continue outer;break;}// -- Simple heuristics --// if the percentage of non-empty contents in check between the// index// 'next_check_pos' and 'check' is greater than some constant value// (e.g. 0.9),// new 'next_check_pos' index is written by 'check'.if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)nextCheckPos = pos;used[begin] = true;size = (size > begin + siblings.get(siblings.size() - 1).code + 1) ? size: begin + siblings.get(siblings.size() - 1).code + 1;for (int i = 0; i < siblings.size(); i++)check[begin + siblings.get(i).code] = begin;for (int i = 0; i < siblings.size(); i++) {List<Node> new_siblings = new ArrayList<Node>();if (fetch(siblings.get(i), new_siblings) == 0) {base[begin + siblings.get(i).code] = (value != null) ? (-value[siblings.get(i).left] - 1) : (-siblings.get(i).left - 1);if (value != null && (-value[siblings.get(i).left] - 1) >= 0) {error_ = -2;return 0;}progress++;// if (progress_func_) (*progress_func_) (progress,// keySize);} else {int h = insert(new_siblings);base[begin + siblings.get(i).code] = h;}}return begin;}public DoubleArrayTrie() {check = null;base = null;used = null;size = 0;allocSize = 0;// no_delete_ = false;error_ = 0;}// no deconstructor// set_result omitted// the search methods returns (the list of) the value(s) instead// of (the list of) the pair(s) of value(s) and length(s)// set_array omitted// array omittedvoid clear() {// if (! no_delete_)check = null;base = null;used = null;allocSize = 0;size = 0;// no_delete_ = false;}public int getUnitSize() {return UNIT_SIZE;}public int getSize() {return size;}public int getTotalSize() {return size * UNIT_SIZE;}public int getNonzeroSize() {int result = 0;for (int i = 0; i < size; i++)if (check[i] != 0)result++;return result;}public int build(List<String> key) {return build(key, null, null, key.size());}/*** 使用字符串 key 构建 Trie。* 调用 fetch() 方法获取根节点的兄弟节点,然后调用 insert() 方法递归构建。*/public int build(List<String> _key, int _length[], int _value[],int _keySize) {if (_keySize > _key.size() || _key == null)return 0;// progress_func_ = progress_func;key = _key;length = _length;keySize = _keySize;value = _value;progress = 0;resize(65536 * 32);base[0] = 1;nextCheckPos = 0;Node root_node = new Node();root_node.left = 0;root_node.right = keySize;root_node.depth = 0;List<Node> siblings = new ArrayList<Node>();fetch(root_node, siblings);insert(siblings);// size += (1 << 8 * 2) + 1; // ???// if (size >= allocSize) resize (size);used = null;key = null;return error_;}/*** open():从文件读取 base 和 check 数组,加载已有的 Trie 结构。*/public void open(String fileName) throws IOException {File file = new File(fileName);size = (int) file.length() / UNIT_SIZE;check = new int[size];base = new int[size];DataInputStream is = null;try {is = new DataInputStream(new BufferedInputStream(new FileInputStream(file), BUF_SIZE));for (int i = 0; i < size; i++) {base[i] = is.readInt();check[i] = is.readInt();}} finally {if (is != null)is.close();}}/*** save():将 base 和 check 数组保存到文件。*/public void save(String fileName) throws IOException {DataOutputStream out = null;try {out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(fileName)));for (int i = 0; i < size; i++) {out.writeInt(base[i]);out.writeInt(check[i]);}out.close();} finally {if (out != null)out.close();}}/*** 执行字符串的精确匹配搜索,返回匹配的索引值。* 按照字符逐个查找 Trie 节点,直到找到终结节点。*/public int exactMatchSearch(String key) {return exactMatchSearch(key, 0, 0, 0);}public int exactMatchSearch(String key, int pos, int len, int nodePos) {if (len <= 0)len = key.length();if (nodePos <= 0)nodePos = 0;int result = -1;char[] keyChars = key.toCharArray();int b = base[nodePos];int p;for (int i = pos; i < len; i++) {p = b + (int) (keyChars[i]) + 1;if (b == check[p])b = base[p];elsereturn result;}p = b;int n = base[p];if (b == check[p] && n < 0) {result = -n - 1;}return result;}/*** 进行公共前缀搜索,返回匹配的所有索引。* 类似于 exactMatchSearch,但会收集所有的匹配前缀索引。*/public List<Integer> commonPrefixSearch(String key) {return commonPrefixSearch(key, 0, 0, 0);}public List<Integer> commonPrefixSearch(String key, int pos, int len,int nodePos) {if (len <= 0)len = key.length();if (nodePos <= 0)nodePos = 0;List<Integer> result = new ArrayList<Integer>();char[] keyChars = key.toCharArray();int b = base[nodePos];int n;int p;for (int i = pos; i < len; i++) {p = b;n = base[p];if (b == check[p] && n < 0) {result.add(-n - 1);}p = b + (int) (keyChars[i]) + 1;if (b == check[p])b = base[p];elsereturn result;}p = b;n = base[p];if (b == check[p] && n < 0) {result.add(-n - 1);}return result;}
}
AC自动机
AC基于Trie树改进,是一种多模式匹配算法,AC自动机使用Trie树来存放模式串的前缀,通过失败匹配指针(失配指针)来处理匹配失败的跳转。AC自动机详细介绍
使用DAT来表示AC自动机
DFA算法
DFA(Deterministic Finite Automata)即确定有穷自动机,与之对应的是 NFA(Non-Deterministic Finite Automata,不确定有穷自动机)。
关于 DFA 的详细介绍可以看这篇文章:有穷自动机 DFA&NFA (学习笔记) - 小蜗牛的文章 - 知乎 。
数据脱敏方案
概念:对默写敏感信息通过脱敏规则进行数据的变形
适用数据:身份证号码、手机号、卡号、客户号、密码等等
对于身份证号码,采用掩码算法将前几位数据保留,其他为使用 X 或 ***** 替代;
对于姓名,可以使用伪造算法。将真实姓名替换成随机生成的假名。
常用脱敏规则
-
替换(常用):将敏感数据中的特定字符或字符序列替换为其他字符。例如,将信用卡中的中间几位数据替换为 ***** 或其他字符。
-
删除:将敏感数据中的部分内容随机删除。比如电话号码的随机3位数据。
-
重排:将原始数据中的某些字段或字段的顺序打乱。例如:将身份证号码随机位交错互换。
-
加噪:在数据中注入一些误差或者噪音,达到对数据脱敏的效果。例如在敏感数据中添加一些随机生成的字符。
-
加密算法加密
常用脱敏工具
Hutool
Hutool 一个 Java 基础工具类,对文件、流、加密解密、转码、正则、线程、XML 等 JDK 方法进行封装,组成各种 Util 工具类。
import cn.hutool.core.util.DesensitizedUtil;
import org.junit.Test;
import org.springframework.boot.test.context.Spring BootTest;
public class HuToolDesensitizationTest {public void testPhoneDesensitization(){String phone="13723231234";System.out.println(DesensitizedUtil.mobilePhone(phone)); //输出:137****1234}
}
MyBatis-Flex
MyBatis-Flex 提供了 @ColumnMask()
注解,以及内置的 9 种脱敏规则,开箱即用:
/*** 内置的数据脱敏方式*/
public class Masks {/*** 手机号脱敏*/public static final String MOBILE = "mobile";/*** 固定电话脱敏*/public static final String FIXED_PHONE = "fixed_phone";/*** 身份证号脱敏*/public static final String ID_CARD_NUMBER = "id_card_number";/*** 中文名脱敏*/public static final String CHINESE_NAME = "chinese_name";/*** 地址脱敏*/public static final String ADDRESS = "address";/*** 邮件脱敏*/public static final String EMAIL = "email";/*** 密码脱敏*/public static final String PASSWORD = "password";/*** 车牌号脱敏*/public static final String CAR_LICENSE = "car_license";/*** 银行卡号脱敏*/public static final String BANK_CARD_NUMBER = "bank_card_number";//...
}
使用示例
@Table("tb_account")
public class Account {@Id(keyType = KeyType.Auto)private Long id;@ColumnMask(Masks.CHINESE_NAME)private String userName;@ColumnMask(Masks.EMAIL)private String email;}