diff --git a/docs-en/chapter_array_and_linkedlist/array.md b/docs-en/chapter_array_and_linkedlist/array.md index b080c0d86..499562d0b 100755 --- a/docs-en/chapter_array_and_linkedlist/array.md +++ b/docs-en/chapter_array_and_linkedlist/array.md @@ -287,7 +287,7 @@ Accessing elements in an array is highly efficient, allowing us to randomly acce } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization"
@@ -469,7 +469,7 @@ It's important to note that due to the fixed length of an array, inserting an el } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -628,7 +628,7 @@ Please note that after deletion, the former last element becomes "meaningless," } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -852,7 +852,7 @@ In most programming languages, we can traverse an array either by using indices } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1020,7 +1020,7 @@ Because arrays are linear data structures, this operation is commonly referred t } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1230,7 +1230,7 @@ To expand an array, it's necessary to create a larger array and then copy the e } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" diff --git a/docs-en/chapter_array_and_linkedlist/linked_list.md b/docs-en/chapter_array_and_linkedlist/linked_list.md index 49f861d0e..2857f6ac4 100755 --- a/docs-en/chapter_array_and_linkedlist/linked_list.md +++ b/docs-en/chapter_array_and_linkedlist/linked_list.md @@ -540,7 +540,7 @@ In contrast, the time complexity of inserting an element in an array is $O(n)$, } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -730,7 +730,7 @@ Note that although node `P` still points to `n1` after the deletion operation is } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -909,7 +909,7 @@ Note that although node `P` still points to `n1` after the deletion operation is } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1111,7 +1111,7 @@ Traverse the linked list to find a node with a value equal to `target`, and outp } ``` -??? pythontutor "Visualizing Code" +??? 
pythontutor "Code Visualization" diff --git a/docs-en/chapter_array_and_linkedlist/list.md b/docs-en/chapter_array_and_linkedlist/list.md index e562b009a..789d90086 100755 --- a/docs-en/chapter_array_and_linkedlist/list.md +++ b/docs-en/chapter_array_and_linkedlist/list.md @@ -2119,7 +2119,7 @@ To enhance our understanding of how lists work, we will attempt to implement a s } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" diff --git a/docs-en/chapter_computational_complexity/iteration_and_recursion.md b/docs-en/chapter_computational_complexity/iteration_and_recursion.md index d0005a962..ee661b9ef 100644 --- a/docs-en/chapter_computational_complexity/iteration_and_recursion.md +++ b/docs-en/chapter_computational_complexity/iteration_and_recursion.md @@ -182,7 +182,7 @@ The following function implements the sum $1 + 2 + \dots + n$ using a `for` loop } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -393,7 +393,7 @@ Below we use a `while` loop to implement the sum $1 + 2 + \dots + n$: } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -617,7 +617,7 @@ For example, in the following code, the condition variable $i$ is updated twice } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -836,7 +836,7 @@ We can nest one loop structure within another. Below is an example using `for` l } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1046,7 +1046,7 @@ Observe the following code, where calling the function `recur(n)` completes the } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1247,7 +1247,7 @@ For example, in calculating $1 + 2 + \dots + n$, we can make the result variable } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1460,7 +1460,7 @@ Using the recursive relation, and considering the first two numbers as terminati } ``` -??? 
pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1783,7 +1783,7 @@ Therefore, **we can use an explicit stack to simulate the behavior of the call s } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" diff --git a/docs-en/chapter_computational_complexity/space_complexity.md b/docs-en/chapter_computational_complexity/space_complexity.md index 337a7b9b9..97c92e202 100644 --- a/docs-en/chapter_computational_complexity/space_complexity.md +++ b/docs-en/chapter_computational_complexity/space_complexity.md @@ -1065,7 +1065,7 @@ Note that memory occupied by initializing variables or calling functions in a lo } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1334,7 +1334,7 @@ Linear order is common in arrays, linked lists, stacks, queues, etc., where the } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1480,7 +1480,7 @@ As shown below, this function's recursive depth is $n$, meaning there are $n$ in } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1705,7 +1705,7 @@ Quadratic order is common in matrices and graphs, where the number of elements i } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1869,7 +1869,7 @@ As shown below, the recursive depth of this function is $n$, and in each recursi } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -2047,7 +2047,7 @@ Exponential order is common in binary trees. Observe the below image, a "full bi } ``` -??? pythontutor "Visualizing Code" +??? 
pythontutor "Code Visualization" diff --git a/docs-en/chapter_computational_complexity/time_complexity.md b/docs-en/chapter_computational_complexity/time_complexity.md index 0af6f5def..8ca18e698 100644 --- a/docs-en/chapter_computational_complexity/time_complexity.md +++ b/docs-en/chapter_computational_complexity/time_complexity.md @@ -1119,7 +1119,7 @@ Constant order means the number of operations is independent of the input data s } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1276,7 +1276,7 @@ Linear order indicates the number of operations grows linearly with the input da } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1449,7 +1449,7 @@ Operations like array traversal and linked list traversal have a time complexity } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1651,7 +1651,7 @@ Quadratic order means the number of operations grows quadratically with the inpu } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -1936,7 +1936,7 @@ For instance, in bubble sort, the outer loop runs $n - 1$ times, and the inner l } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -2169,7 +2169,7 @@ The following image and code simulate the cell division process, with a time com } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -2309,7 +2309,7 @@ In practice, exponential order often appears in recursive functions. For example } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -2491,7 +2491,7 @@ The following image and code simulate the "halving each round" process, with a t } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -2631,7 +2631,7 @@ Like exponential order, logarithmic order also frequently appears in recursive f } ``` -??? pythontutor "Visualizing Code" +??? 
pythontutor "Code Visualization" @@ -2829,7 +2829,7 @@ Linear-logarithmic order often appears in nested loops, with the complexities of } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -3040,7 +3040,7 @@ Factorials are typically implemented using recursion. As shown in the image and } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" @@ -3407,7 +3407,7 @@ The "worst-case time complexity" corresponds to the asymptotic upper bound, deno } ``` -??? pythontutor "Visualizing Code" +??? pythontutor "Code Visualization" diff --git a/docs-en/chapter_hashing/hash_algorithm.md b/docs-en/chapter_hashing/hash_algorithm.md new file mode 100644 index 000000000..773a52994 --- /dev/null +++ b/docs-en/chapter_hashing/hash_algorithm.md @@ -0,0 +1,886 @@ +--- +comments: true +--- + +# 6.3 Hash Algorithms + +The previous two sections introduced the working principle of hash tables and the methods to handle hash collisions. However, both open addressing and chaining can **only ensure that the hash table functions normally when collisions occur, but cannot reduce the frequency of hash collisions**. + +If hash collisions occur too frequently, the performance of the hash table will deteriorate drastically. As shown in the Figure 6-8 , for a chaining hash table, in the ideal case, the key-value pairs are evenly distributed across the buckets, achieving optimal query efficiency; in the worst case, all key-value pairs are stored in the same bucket, degrading the time complexity to $O(n)$. + +![Ideal and Worst Cases of Hash Collisions](hash_algorithm.assets/hash_collision_best_worst_condition.png){ class="animation-figure" } + +Figure 6-8 Ideal and Worst Cases of Hash Collisions
+ +**The distribution of key-value pairs is determined by the hash function**. Recall the two steps a hash function performs: first compute the hash value, then take it modulo the array length: + +```shell +index = hash(key) % capacity +``` + +Observing the above formula, when the hash table capacity `capacity` is fixed, **the hash algorithm `hash()` determines the output value**, thereby determining the distribution of key-value pairs in the hash table. + +This means that, to reduce the probability of hash collisions, we should focus on the design of the hash algorithm `hash()`. + +## 6.3.1 Goals of Hash Algorithms + +To achieve a "fast and stable" hash table data structure, hash algorithms should have the following characteristics: + +- **Determinism**: For the same input, the hash algorithm should always produce the same output. Only then can the hash table be reliable. +- **High Efficiency**: The process of computing the hash value should be fast enough. The smaller the computational overhead, the more practical the hash table. +- **Uniform Distribution**: The hash algorithm should ensure that key-value pairs are evenly distributed in the hash table. The more uniform the distribution, the lower the probability of hash collisions. + +In fact, hash algorithms are not only used to implement hash tables but are also widely applied in other fields. + +- **Password Storage**: To protect the security of user passwords, systems usually do not store the plaintext passwords but rather the hash values of the passwords. When a user enters a password, the system calculates the hash value of the input and compares it with the stored hash value. If they match, the password is considered correct. +- **Data Integrity Check**: The data sender can calculate the hash value of the data and send it along; the receiver can recalculate the hash value of the received data and compare it with the received hash value. If they match, the data is considered intact. 
+ +For cryptographic applications, to prevent reverse engineering such as deducing the original password from the hash value, hash algorithms need higher-level security features. + +- **Unidirectionality**: It should be impossible to deduce any information about the input data from the hash value. +- **Collision Resistance**: It should be extremely difficult to find two different inputs that produce the same hash value. +- **Avalanche Effect**: Minor changes in the input should lead to significant and unpredictable changes in the output. + +Note that **"Uniform Distribution" and "Collision Resistance" are two separate concepts**. Satisfying uniform distribution does not necessarily imply collision resistance. For example, under random input `key`, the hash function `key % 100` can produce a uniformly distributed output. However, this hash algorithm is too simple, and all `key` with the same last two digits will have the same output, making it easy to deduce a usable `key` from the hash value, thereby cracking the password. + +## 6.3.2 Design of Hash Algorithms + +The design of hash algorithms is a complex issue that requires consideration of many factors. However, for some less demanding scenarios, we can also design some simple hash algorithms. + +- **Additive Hash**: Add up the ASCII codes of all characters in the input and use the total sum as the hash value. +- **Multiplicative Hash**: Exploit the decorrelating effect of multiplication: in each round, multiply the current hash value by a constant and then add the ASCII code of the next character. +- **XOR Hash**: Accumulate the hash value by XORing each element of the input data. +- **Rotating Hash**: Accumulate the ASCII code of each character into a hash value, performing a rotation operation on the hash value before each accumulation. 
+ +=== "Python" + + ```python title="simple_hash.py" + def add_hash(key: str) -> int: + """加法哈希""" + hash = 0 + modulus = 1000000007 + for c in key: + hash += ord(c) + return hash % modulus + + def mul_hash(key: str) -> int: + """乘法哈希""" + hash = 0 + modulus = 1000000007 + for c in key: + hash = 31 * hash + ord(c) + return hash % modulus + + def xor_hash(key: str) -> int: + """异或哈希""" + hash = 0 + modulus = 1000000007 + for c in key: + hash ^= ord(c) + return hash % modulus + + def rot_hash(key: str) -> int: + """旋转哈希""" + hash = 0 + modulus = 1000000007 + for c in key: + hash = (hash << 4) ^ (hash >> 28) ^ ord(c) + return hash % modulus + ``` + +=== "C++" + + ```cpp title="simple_hash.cpp" + /* 加法哈希 */ + int addHash(string key) { + long long hash = 0; + const int MODULUS = 1000000007; + for (unsigned char c : key) { + hash = (hash + (int)c) % MODULUS; + } + return (int)hash; + } + + /* 乘法哈希 */ + int mulHash(string key) { + long long hash = 0; + const int MODULUS = 1000000007; + for (unsigned char c : key) { + hash = (31 * hash + (int)c) % MODULUS; + } + return (int)hash; + } + + /* 异或哈希 */ + int xorHash(string key) { + int hash = 0; + const int MODULUS = 1000000007; + for (unsigned char c : key) { + hash ^= (int)c; + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + int rotHash(string key) { + long long hash = 0; + const int MODULUS = 1000000007; + for (unsigned char c : key) { + hash = ((hash << 4) ^ (hash >> 28) ^ (int)c) % MODULUS; + } + return (int)hash; + } + ``` + +=== "Java" + + ```java title="simple_hash.java" + /* 加法哈希 */ + int addHash(String key) { + long hash = 0; + final int MODULUS = 1000000007; + for (char c : key.toCharArray()) { + hash = (hash + (int) c) % MODULUS; + } + return (int) hash; + } + + /* 乘法哈希 */ + int mulHash(String key) { + long hash = 0; + final int MODULUS = 1000000007; + for (char c : key.toCharArray()) { + hash = (31 * hash + (int) c) % MODULUS; + } + return (int) hash; + } + + /* 异或哈希 */ + int xorHash(String key) { + int hash = 0; 
+ final int MODULUS = 1000000007; + for (char c : key.toCharArray()) { + hash ^= (int) c; + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + int rotHash(String key) { + long hash = 0; + final int MODULUS = 1000000007; + for (char c : key.toCharArray()) { + hash = ((hash << 4) ^ (hash >> 28) ^ (int) c) % MODULUS; + } + return (int) hash; + } + ``` + +=== "C#" + + ```csharp title="simple_hash.cs" + /* 加法哈希 */ + int AddHash(string key) { + long hash = 0; + const int MODULUS = 1000000007; + foreach (char c in key) { + hash = (hash + c) % MODULUS; + } + return (int)hash; + } + + /* 乘法哈希 */ + int MulHash(string key) { + long hash = 0; + const int MODULUS = 1000000007; + foreach (char c in key) { + hash = (31 * hash + c) % MODULUS; + } + return (int)hash; + } + + /* 异或哈希 */ + int XorHash(string key) { + int hash = 0; + const int MODULUS = 1000000007; + foreach (char c in key) { + hash ^= c; + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + int RotHash(string key) { + long hash = 0; + const int MODULUS = 1000000007; + foreach (char c in key) { + hash = ((hash << 4) ^ (hash >> 28) ^ c) % MODULUS; + } + return (int)hash; + } + ``` + +=== "Go" + + ```go title="simple_hash.go" + /* 加法哈希 */ + func addHash(key string) int { + var hash int64 + var modulus int64 + + modulus = 1000000007 + for _, b := range []byte(key) { + hash = (hash + int64(b)) % modulus + } + return int(hash) + } + + /* 乘法哈希 */ + func mulHash(key string) int { + var hash int64 + var modulus int64 + + modulus = 1000000007 + for _, b := range []byte(key) { + hash = (31*hash + int64(b)) % modulus + } + return int(hash) + } + + /* 异或哈希 */ + func xorHash(key string) int { + hash := 0 + modulus := 1000000007 + for _, b := range []byte(key) { + fmt.Println(int(b)) + hash ^= int(b) + hash = (31*hash + int(b)) % modulus + } + return hash & modulus + } + + /* 旋转哈希 */ + func rotHash(key string) int { + var hash int64 + var modulus int64 + + modulus = 1000000007 + for _, b := range []byte(key) { + hash = ((hash << 4) ^ 
(hash >> 28) ^ int64(b)) % modulus + } + return int(hash) + } + ``` + +=== "Swift" + + ```swift title="simple_hash.swift" + /* 加法哈希 */ + func addHash(key: String) -> Int { + var hash = 0 + let MODULUS = 1_000_000_007 + for c in key { + for scalar in c.unicodeScalars { + hash = (hash + Int(scalar.value)) % MODULUS + } + } + return hash + } + + /* 乘法哈希 */ + func mulHash(key: String) -> Int { + var hash = 0 + let MODULUS = 1_000_000_007 + for c in key { + for scalar in c.unicodeScalars { + hash = (31 * hash + Int(scalar.value)) % MODULUS + } + } + return hash + } + + /* 异或哈希 */ + func xorHash(key: String) -> Int { + var hash = 0 + let MODULUS = 1_000_000_007 + for c in key { + for scalar in c.unicodeScalars { + hash ^= Int(scalar.value) + } + } + return hash & MODULUS + } + + /* 旋转哈希 */ + func rotHash(key: String) -> Int { + var hash = 0 + let MODULUS = 1_000_000_007 + for c in key { + for scalar in c.unicodeScalars { + hash = ((hash << 4) ^ (hash >> 28) ^ Int(scalar.value)) % MODULUS + } + } + return hash + } + ``` + +=== "JS" + + ```javascript title="simple_hash.js" + /* 加法哈希 */ + function addHash(key) { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash = (hash + c.charCodeAt(0)) % MODULUS; + } + return hash; + } + + /* 乘法哈希 */ + function mulHash(key) { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash = (31 * hash + c.charCodeAt(0)) % MODULUS; + } + return hash; + } + + /* 异或哈希 */ + function xorHash(key) { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash ^= c.charCodeAt(0); + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + function rotHash(key) { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash = ((hash << 4) ^ (hash >> 28) ^ c.charCodeAt(0)) % MODULUS; + } + return hash; + } + ``` + +=== "TS" + + ```typescript title="simple_hash.ts" + /* 加法哈希 */ + function addHash(key: string): number { + let hash = 0; + const MODULUS = 1000000007; + for (const c of 
key) { + hash = (hash + c.charCodeAt(0)) % MODULUS; + } + return hash; + } + + /* 乘法哈希 */ + function mulHash(key: string): number { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash = (31 * hash + c.charCodeAt(0)) % MODULUS; + } + return hash; + } + + /* 异或哈希 */ + function xorHash(key: string): number { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash ^= c.charCodeAt(0); + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + function rotHash(key: string): number { + let hash = 0; + const MODULUS = 1000000007; + for (const c of key) { + hash = ((hash << 4) ^ (hash >> 28) ^ c.charCodeAt(0)) % MODULUS; + } + return hash; + } + ``` + +=== "Dart" + + ```dart title="simple_hash.dart" + /* 加法哈希 */ + int addHash(String key) { + int hash = 0; + final int MODULUS = 1000000007; + for (int i = 0; i < key.length; i++) { + hash = (hash + key.codeUnitAt(i)) % MODULUS; + } + return hash; + } + + /* 乘法哈希 */ + int mulHash(String key) { + int hash = 0; + final int MODULUS = 1000000007; + for (int i = 0; i < key.length; i++) { + hash = (31 * hash + key.codeUnitAt(i)) % MODULUS; + } + return hash; + } + + /* 异或哈希 */ + int xorHash(String key) { + int hash = 0; + final int MODULUS = 1000000007; + for (int i = 0; i < key.length; i++) { + hash ^= key.codeUnitAt(i); + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + int rotHash(String key) { + int hash = 0; + final int MODULUS = 1000000007; + for (int i = 0; i < key.length; i++) { + hash = ((hash << 4) ^ (hash >> 28) ^ key.codeUnitAt(i)) % MODULUS; + } + return hash; + } + ``` + +=== "Rust" + + ```rust title="simple_hash.rs" + /* 加法哈希 */ + fn add_hash(key: &str) -> i32 { + let mut hash = 0_i64; + const MODULUS: i64 = 1000000007; + + for c in key.chars() { + hash = (hash + c as i64) % MODULUS; + } + + hash as i32 + } + + /* 乘法哈希 */ + fn mul_hash(key: &str) -> i32 { + let mut hash = 0_i64; + const MODULUS: i64 = 1000000007; + + for c in key.chars() { + hash = (31 * hash + c as i64) % MODULUS; 
+ } + + hash as i32 + } + + /* 异或哈希 */ + fn xor_hash(key: &str) -> i32 { + let mut hash = 0_i64; + const MODULUS: i64 = 1000000007; + + for c in key.chars() { + hash ^= c as i64; + } + + (hash & MODULUS) as i32 + } + + /* 旋转哈希 */ + fn rot_hash(key: &str) -> i32 { + let mut hash = 0_i64; + const MODULUS: i64 = 1000000007; + + for c in key.chars() { + hash = ((hash << 4) ^ (hash >> 28) ^ c as i64) % MODULUS; + } + + hash as i32 + } + ``` + +=== "C" + + ```c title="simple_hash.c" + /* 加法哈希 */ + int addHash(char *key) { + long long hash = 0; + const int MODULUS = 1000000007; + for (int i = 0; i < strlen(key); i++) { + hash = (hash + (unsigned char)key[i]) % MODULUS; + } + return (int)hash; + } + + /* 乘法哈希 */ + int mulHash(char *key) { + long long hash = 0; + const int MODULUS = 1000000007; + for (int i = 0; i < strlen(key); i++) { + hash = (31 * hash + (unsigned char)key[i]) % MODULUS; + } + return (int)hash; + } + + /* 异或哈希 */ + int xorHash(char *key) { + int hash = 0; + const int MODULUS = 1000000007; + + for (int i = 0; i < strlen(key); i++) { + hash ^= (unsigned char)key[i]; + } + return hash & MODULUS; + } + + /* 旋转哈希 */ + int rotHash(char *key) { + long long hash = 0; + const int MODULUS = 1000000007; + for (int i = 0; i < strlen(key); i++) { + hash = ((hash << 4) ^ (hash >> 28) ^ (unsigned char)key[i]) % MODULUS; + } + + return (int)hash; + } + ``` + +=== "Zig" + + ```zig title="simple_hash.zig" + [class]{}-[func]{addHash} + + [class]{}-[func]{mulHash} + + [class]{}-[func]{xorHash} + + [class]{}-[func]{rotHash} + ``` + +??? pythontutor "Code Visualization" + + + + +It is observed that the last step of each hash algorithm is to take the modulus of the large prime number $1000000007$ to ensure that the hash value is within an appropriate range. It is worth pondering why emphasis is placed on modulo a prime number, or what are the disadvantages of modulo a composite number? This is an interesting question. 
+ +To conclude: **Using a large prime number as the modulus can maximize the uniform distribution of hash values**. Since a prime number shares no common factors with any number other than its own multiples, it can reduce the periodic patterns caused by the modulo operation, thus reducing hash collisions. + +For example, suppose we choose the composite number $9$ as the modulus. Since $9$ is divisible by $3$, every `key` divisible by $3$ will be mapped to one of the hash values $0$, $3$, $6$. + +$$ +\begin{aligned} +\text{modulus} & = 9 \newline +\text{key} & = \{ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, \dots \} \newline +\text{hash} & = \{ 0, 3, 6, 0, 3, 6, 0, 3, 6, 0, 3, 6, \dots \} +\end{aligned} +$$ + +If the input `key` happens to have this kind of arithmetic sequence distribution, then the hash values will cluster, thereby exacerbating hash collisions. Now suppose we replace `modulus` with the prime number $13$. Since the common difference $3$ of the `key` sequence shares no common factor with `modulus`, the uniformity of the output hash values will be significantly improved. + +$$ +\begin{aligned} +\text{modulus} & = 13 \newline +\text{key} & = \{ 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, \dots \} \newline +\text{hash} & = \{ 0, 3, 6, 9, 12, 2, 5, 8, 11, 1, 4, 7, \dots \} +\end{aligned} +$$ + +It is worth noting that if the `key` is guaranteed to be randomly and uniformly distributed, then choosing a prime number or a composite number as the modulus can both produce uniformly distributed hash values. However, when the distribution of `key` has some periodicity, taking the modulus by a composite number is more likely to result in clustering. + +In summary, we usually choose a prime number as the modulus, and this prime number should be large enough to eliminate periodic patterns as much as possible, enhancing the robustness of the hash algorithm. 
+ +## 6.3.3 Common Hash Algorithms + +It is not hard to see that the simple hash algorithms mentioned above are quite "fragile" and far from reaching the design goals of hash algorithms. For example, since addition and XOR obey the commutative law, additive hash and XOR hash cannot distinguish strings that contain the same characters in a different order, which may exacerbate hash collisions and cause security issues. + +In practice, we usually use some standard hash algorithms, such as MD5, SHA-1, SHA-2, and SHA-3. They can map input data of any length to a fixed-length hash value. + +Over the past century, hash algorithms have been in a continuous process of upgrading and optimization. Some researchers strive to improve the performance of hash algorithms, while others, including hackers, are dedicated to finding security issues in hash algorithms. Table 6-2 shows hash algorithms commonly used in practical applications. + +- MD5 and SHA-1 have been successfully attacked multiple times and are thus abandoned in various security applications. +- The SHA-2 series, especially SHA-256, is one of the most secure hash algorithms to date, with no successful attacks reported, and is hence commonly used in various security applications and protocols. +- SHA-3 has lower implementation costs and higher computational efficiency compared to SHA-2, but its current usage coverage is not as extensive as the SHA-2 series. + +Table 6-2 Common Hash Algorithms
+ +Figure 6-5 Separate Chaining Hash Table
+ +The operations of a hash table implemented with separate chaining change as follows: + +- **Querying Elements**: Given `key`, apply the hash function to obtain the bucket index, access the head node of the list, then traverse the list and compare `key` to find the target key-value pair. +- **Adding Elements**: First access the list head node via the hash function, then add the node (key-value pair) to the list. +- **Deleting Elements**: Access the list head based on the hash function's result, then traverse the list to find and remove the target node. + +Separate chaining has the following limitations: + +- **Increased Space Usage**: The linked list contains node pointers, which consume more memory space than arrays. +- **Reduced Query Efficiency**: Finding the corresponding element requires a linear traversal of the list. + +The code below provides a simple implementation of a separate chaining hash table, with two things to note: + +- Lists (dynamic arrays) are used instead of linked lists for simplicity. In this setup, the hash table (array) contains multiple buckets, each of which is a list. +- This implementation includes a method for resizing the hash table. When the load factor exceeds $\frac{2}{3}$, we resize the hash table to twice its original size. 
+ +=== "Python" + + ```python title="hash_map_chaining.py" + class HashMapChaining: + """链式地址哈希表""" + + def __init__(self): + """构造方法""" + self.size = 0 # 键值对数量 + self.capacity = 4 # 哈希表容量 + self.load_thres = 2.0 / 3.0 # 触发扩容的负载因子阈值 + self.extend_ratio = 2 # 扩容倍数 + self.buckets = [[] for _ in range(self.capacity)] # 桶数组 + + def hash_func(self, key: int) -> int: + """哈希函数""" + return key % self.capacity + + def load_factor(self) -> float: + """负载因子""" + return self.size / self.capacity + + def get(self, key: int) -> str | None: + """查询操作""" + index = self.hash_func(key) + bucket = self.buckets[index] + # 遍历桶,若找到 key ,则返回对应 val + for pair in bucket: + if pair.key == key: + return pair.val + # 若未找到 key ,则返回 None + return None + + def put(self, key: int, val: str): + """添加操作""" + # 当负载因子超过阈值时,执行扩容 + if self.load_factor() > self.load_thres: + self.extend() + index = self.hash_func(key) + bucket = self.buckets[index] + # 遍历桶,若遇到指定 key ,则更新对应 val 并返回 + for pair in bucket: + if pair.key == key: + pair.val = val + return + # 若无该 key ,则将键值对添加至尾部 + pair = Pair(key, val) + bucket.append(pair) + self.size += 1 + + def remove(self, key: int): + """删除操作""" + index = self.hash_func(key) + bucket = self.buckets[index] + # 遍历桶,从中删除键值对 + for pair in bucket: + if pair.key == key: + bucket.remove(pair) + self.size -= 1 + break + + def extend(self): + """扩容哈希表""" + # 暂存原哈希表 + buckets = self.buckets + # 初始化扩容后的新哈希表 + self.capacity *= self.extend_ratio + self.buckets = [[] for _ in range(self.capacity)] + self.size = 0 + # 将键值对从原哈希表搬运至新哈希表 + for bucket in buckets: + for pair in bucket: + self.put(pair.key, pair.val) + + def print(self): + """打印哈希表""" + for bucket in self.buckets: + res = [] + for pair in bucket: + res.append(str(pair.key) + " -> " + pair.val) + print(res) + ``` + +=== "C++" + + ```cpp title="hash_map_chaining.cpp" + /* 链式地址哈希表 */ + class HashMapChaining { + private: + int size; // 键值对数量 + int capacity; // 哈希表容量 + double loadThres; // 触发扩容的负载因子阈值 + int extendRatio; // 扩容倍数 
+ vectorFigure 6-6 Distribution of Key-Value Pairs in Open Addressing (Linear Probing) Hash Table
+ +However, **linear probing tends to create "clustering"**. Specifically, the longer a contiguous run of occupied positions in the array becomes, the more likely these positions are to encounter hash collisions, further promoting the growth of these clusters and eventually leading to deterioration in the efficiency of operations. + +It's important to note that **we cannot directly delete elements in an open addressing hash table**. Deleting an element creates an empty bucket `None` in the array. When searching for elements, if linear probing encounters this empty bucket, it will return, making the elements below this bucket inaccessible. The program may incorrectly assume these elements do not exist, as shown in Figure 6-7. + +![Query Issues Caused by Deletion in Open Addressing](hash_collision.assets/hash_table_open_addressing_deletion.png){ class="animation-figure" } + +Figure 6-7 Query Issues Caused by Deletion in Open Addressing
+ +To solve this problem, we can use a "lazy deletion" mechanism: instead of directly removing elements from the hash table, **use a constant `TOMBSTONE` to mark the bucket**. In this mechanism, both `None` and `TOMBSTONE` represent empty buckets and can hold key-value pairs. However, when linear probing encounters `TOMBSTONE`, it should continue traversing since there may still be key-value pairs below it. + +However, **lazy deletion may accelerate the degradation of hash table performance**. Every deletion operation produces a delete mark, and as the number of `TOMBSTONE` marks grows, so does the search time, as linear probing may have to skip over multiple `TOMBSTONE` marks to find the target element. + +Therefore, consider recording the index of the first `TOMBSTONE` encountered during linear probing and swapping the target element found with this `TOMBSTONE`. The advantage of this is that each time a query or addition is performed, the element is moved to a bucket closer to the ideal position (starting point of probing), thereby optimizing the query efficiency. + +The code below implements an open addressing (linear probing) hash table with lazy deletion. To make fuller use of the hash table space, we treat the hash table as a "circular array," continuing to traverse from the beginning when the end of the array is passed. 
+ +=== "Python" + + ```python title="hash_map_open_addressing.py" + class HashMapOpenAddressing: + """开放寻址哈希表""" + + def __init__(self): + """构造方法""" + self.size = 0 # 键值对数量 + self.capacity = 4 # 哈希表容量 + self.load_thres = 2.0 / 3.0 # 触发扩容的负载因子阈值 + self.extend_ratio = 2 # 扩容倍数 + self.buckets: list[Pair | None] = [None] * self.capacity # 桶数组 + self.TOMBSTONE = Pair(-1, "-1") # 删除标记 + + def hash_func(self, key: int) -> int: + """哈希函数""" + return key % self.capacity + + def load_factor(self) -> float: + """负载因子""" + return self.size / self.capacity + + def find_bucket(self, key: int) -> int: + """搜索 key 对应的桶索引""" + index = self.hash_func(key) + first_tombstone = -1 + # 线性探测,当遇到空桶时跳出 + while self.buckets[index] is not None: + # 若遇到 key ,返回对应的桶索引 + if self.buckets[index].key == key: + # 若之前遇到了删除标记,则将键值对移动至该索引处 + if first_tombstone != -1: + self.buckets[first_tombstone] = self.buckets[index] + self.buckets[index] = self.TOMBSTONE + return first_tombstone # 返回移动后的桶索引 + return index # 返回桶索引 + # 记录遇到的首个删除标记 + if first_tombstone == -1 and self.buckets[index] is self.TOMBSTONE: + first_tombstone = index + # 计算桶索引,越过尾部则返回头部 + index = (index + 1) % self.capacity + # 若 key 不存在,则返回添加点的索引 + return index if first_tombstone == -1 else first_tombstone + + def get(self, key: int) -> str: + """查询操作""" + # 搜索 key 对应的桶索引 + index = self.find_bucket(key) + # 若找到键值对,则返回对应 val + if self.buckets[index] not in [None, self.TOMBSTONE]: + return self.buckets[index].val + # 若键值对不存在,则返回 None + return None + + def put(self, key: int, val: str): + """添加操作""" + # 当负载因子超过阈值时,执行扩容 + if self.load_factor() > self.load_thres: + self.extend() + # 搜索 key 对应的桶索引 + index = self.find_bucket(key) + # 若找到键值对,则覆盖 val 并返回 + if self.buckets[index] not in [None, self.TOMBSTONE]: + self.buckets[index].val = val + return + # 若键值对不存在,则添加该键值对 + self.buckets[index] = Pair(key, val) + self.size += 1 + + def remove(self, key: int): + """删除操作""" + # 搜索 key 对应的桶索引 + index = self.find_bucket(key) + # 若找到键值对,则用删除标记覆盖它 + if 
self.buckets[index] not in [None, self.TOMBSTONE]: + self.buckets[index] = self.TOMBSTONE + self.size -= 1 + + def extend(self): + """扩容哈希表""" + # 暂存原哈希表 + buckets_tmp = self.buckets + # 初始化扩容后的新哈希表 + self.capacity *= self.extend_ratio + self.buckets = [None] * self.capacity + self.size = 0 + # 将键值对从原哈希表搬运至新哈希表 + for pair in buckets_tmp: + if pair not in [None, self.TOMBSTONE]: + self.put(pair.key, pair.val) + + def print(self): + """打印哈希表""" + for pair in self.buckets: + if pair is None: + print("None") + elif pair is self.TOMBSTONE: + print("TOMBSTONE") + else: + print(pair.key, "->", pair.val) + ``` + +=== "C++" + + ```cpp title="hash_map_open_addressing.cpp" + /* 开放寻址哈希表 */ + class HashMapOpenAddressing { + private: + int size; // 键值对数量 + int capacity = 4; // 哈希表容量 + const double loadThres = 2.0 / 3.0; // 触发扩容的负载因子阈值 + const int extendRatio = 2; // 扩容倍数 + vector