summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile5
-rw-r--r--bibtex.bib17
-rw-r--r--book.tex132
3 files changed, 146 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index 15a94bd..a132e3f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,7 @@
main: *.tex
- tectonic book.tex
+ latexmk -pdf book.tex
+
+clean:
+	rm -f texput.log book.{aux,log,fls,fdb_latexmk,pdf}
diff --git a/bibtex.bib b/bibtex.bib
new file mode 100644
index 0000000..fa45d92
--- /dev/null
+++ b/bibtex.bib
@@ -0,0 +1,17 @@
+@inproceedings{Patrascu:2011:PST:1993636.1993638,
+ author = {Patrascu, Mihai and Thorup, Mikkel},
+ title = {The Power of Simple Tabulation Hashing},
+ booktitle = {Proceedings of the Forty-third Annual ACM Symposium on Theory of Computing},
+ series = {STOC '11},
+ year = {2011},
+ isbn = {978-1-4503-0691-1},
+ location = {San Jose, California, USA},
+ pages = {1--10},
+ numpages = {10},
+ url = {http://doi.acm.org/10.1145/1993636.1993638},
+ doi = {10.1145/1993636.1993638},
+ acmid = {1993638},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {concentration bounds, cuckoo hashing, independence, linear probing, minwise independence, tabulation hashing},
+}
diff --git a/book.tex b/book.tex
index fb1193c..deccabf 100644
--- a/book.tex
+++ b/book.tex
@@ -1,16 +1,26 @@
\documentclass[a4paper]{book}
-\usepackage[a4paper, top=3cm,bottom=3cm, left=3cm, right=3cm]{geometry}
+\usepackage[a4paper, top=3cm,bottom=3cm, left=3cm, right=3cm, paperheight=23cm]{geometry}
\usepackage{amsmath}
+\usepackage{amsfonts}
\usepackage{tikz}
\usetikzlibrary{positioning,calc,shapes}
+\usepackage{hyperref}
+\usepackage[backend=bibtex]{biblatex}
+\bibliography{bibtex}
+
+\usepackage{lipsum}
\title{Advanced Data Structures}
\author{Martin Hafskjold Thoresen}
\date{\today}
\newcommand{\topics}[1]{Topics: \textit{#1}}
-\newcommand{\code}[1]{\textsc{#1}}
+\newcommand{\code}[2]{\textsc{#1}$(#2)$}
+\newenvironment{example}[0]{{\parindent=0em \textbf{Example:}}}{\vspace{1em}}
+\newenvironment{definition}[1]{{\parindent=0em \textbf{Definition} --- \emph{#1}:\\}}{\vspace{1em}}
+% Expectation symbol
+\DeclareMathOperator*{\E}{\mathbb{E}}
\begin{document}
\maketitle
@@ -22,12 +32,119 @@ The chapters are arranged in the same way as the lectures, and some chapters cov
\tableofcontents
\chapter{Hashing}
-\topics{Universal hashing, tabulation hashing, hash tables, cuckoo hashing.}
+\topics{Universal hashing, tabulation hashing, hash tables, chaining, linear probing, cuckoo hashing.}
\section{Motivation}
-% Operations we need: \code{Update}
-\textsc{asd}
-We already know how to
+Operations we need: \code{Query}{x}, \code{Insert}{x}, \code{Delete}{x},
+where $x \in [\mathcal{U}]$.
+$\mathcal{U}$ is called the \emph{Universe}.
+We already know how to do this using Balanced Binary Search Trees (BBSTs), with $O(n)$ space and $O(\log n)$ time on all operations.
+We would like to have $O(1)$ time, while still having $O(n)$ space.
+
+\section{The Hash Function}
+
+A \emph{Hash Function} is a function $h: [\mathcal{U}] \rightarrow [m]$ where $\mathcal{U} \gg m$.
+$m$ is the table size.
+Ideally, $h$ is a totally random family of hash functions, meaning there is no correlation between its input $u \in \mathcal{U}$ and $h(u)$.
+However, encoding a totally random function takes $O(u \log m)$ space.
+
+This is a problem since $u \gg n$, so the space used is far larger than the $O(n)$ we want.
+Therefore we settle on a family $\mathcal{H}$ of hash functions of small size which is \emph{Universal}.
+Universality means that the probability of two non-equal keys having the same hash value is $O(1/m)$:
+$$\forall x \neq y\ \Pr_{h \in \mathcal{H}}[h(x) = h(y)] = O(1/m)$$
+
+\begin{example}
+ $h(x) = ((a x) \mod p) \mod m$ where $0<a<p$, $p$ is prime and $p > m$ is a universal hash function.
+\end{example}
+
+\begin{definition}{k-independent}
+ A family of hash functions $\mathcal{H}$ is k-independent if
+ $$\forall x_1, \dots x_k\ \Pr[\bigwedge\limits_{i} h(x_i) = t_i] = O(1/m^k)$$
+\end{definition}
+
+\begin{example}
+ $((ax + b) \mod p) \mod m$ is 2-independent.
+\end{example}
+
+\begin{example}
+ A polynomial of degree $k$,
+ $((\sum\limits_{i=0}^{k-1} a_i x^i)\mod p) \mod m$, is k-independent.
+\end{example}
+
+\section{Simple Tabulation Hashing}
+
+Tabulation hashing is a hashing scheme.
+We view $x$ as a vector of bit blocks $x_1, \dots, x_c$,
+and use a totally random hash table on \emph{each} block.
+Then we \texttt{xor} the blocks together to get the final value:
+$$h(x) = T_1(x_1) \oplus T_2(x_2) \oplus \dots \oplus T_c(x_c)$$
+Since a block is $(\log u)/c$ bits, the universe for one block is of size $u^{1/c}$.
+One table is of size $u^{1/c}$ (we assume the hash values are machine words, as in~\cite{Patrascu:2011:PST:1993636.1993638}),
+so the total space used is $O(cu^{1/c})$.
+The time spent is $O(c)$, since each lookup is $O(1)$.
+Simple tabulation hashing is 3-independent, but not 4-independent.
+
+
+\section{Chaining}
+Because of the birthday paradox, hashing collisions in the table are very probable\footnote{unless we know all keys up front, and $n\leq m$}.
+Therefore we need a scheme to handle collisions, the simplest of which is \emph{chaining}.
+Each bucket in the hash table is a linked list of the values mapping to that bucket.
+
+What can we say about the length of the chain?
+Let $C_t$ be the length of chain $t$.
+$\E[C_t] = \sum\limits_i \Pr[h(x_i) = t]$.
+If we have universal hashing we know that $\Pr[h(x_i) = t] = O(1/m)$,
+so $\E[C_t] = O(1)$ for $m = \Omega(n)$.
+
+Since we need to traverse the list for all operations, the cost is quadratic in $C_t$,
+so we are interested in $\E[C_t^2]$:
+$$ \E[C_t^2] = 1/m \sum_{s \in [m]} \E[C_s^2] = 1/m \sum_{i \neq j} \Pr[h(x_i) = h(x_j)] $$
+If we have universal hashing this is $\frac{1}{m} n^2 O(\frac{1}{m}) = O(n^2/m^2) = O(1)$ for $m = \Omega(n)$.
+With a totally random hash function $C_t = O(\frac{\log n}{\log\log n})$ with probability $1 - 1/n^c$ for any $c$.
+This also holds for simple tabulation hashing.
+
+
+\section{Perfect Hashing (FKS hashing)}
+
+Idea: resolve collisions with another layer of hash tables.
+
+On collision in bucket tables, rebuild the table using a different hash function.
+When $n$ gets too large, double $m$ and start over, in order to keep the number of elements in the bucket tables small.
+If a bucket table is too large, double its size and rehash.
+
+Since $\E[C_t]$ is the number of elements that should not collide in the table, we can adjust the table size in such a way that
+$\E[C_t] \leq 1/2$, so $\Pr[\text{no collisions in } C_t] \geq 1/2$.
+Use size $\Theta(C_t^2)$ for the bucket tables.
+This makes the expected number of rebuilds of the bucket tables $O(1)$ before getting zero collisions.
+
+$\E[\text{space}] = \Theta(m + \sum\limits_t C_t^2) = \Theta(m + n^2/m) = \Theta(n)$ for $m=\Theta(n)$.
+
+Results: $O(1)$ deterministic query, $O(n)$ expected update (w.h.p.), $O(n)$ space.
+
+
+\section{Linear Probing}
+
+Idea: store values directly in the table. On collision, look for next available spot.
+On deletion, replace the element with a ``tombstone'', so that searches do not stop too early.
+Great for cache performance.
+The main problem of linear probing is the increasing lengths of the ``runs'',
+that is intervals of non-empty cells.
+
+We require $m \geq (1+\epsilon) n$ (not just $m=\Omega(n)$), in order to have available cells in the table.
+Space is naturally $O(m)$.
+With a totally random hashing function, or by using tabulation hashing, the expected time for operations is $O(1/\epsilon^2)$.
+With a $O(\log n)$-wise or 5-wise independent hashing function, constant time is expected.
+
+
+\section{Cuckoo Hashing}
+Idea: have two hash tables with different hashing functions.
+On collision, swap the colliding values, and try to insert the swapped value in the other table.
+Deletes are simple.
+If we get a cycle (swap $a$ with $b$, swap $b$ with $c$, and $c$ hashes to $a$ again), we rebuild the tables.
+The table sizes are $m \geq (1+\epsilon)n$, so the space used is $O((2+\epsilon)n)$.
+
+Any value $x$ is either in $A[h_A(x)]$ or in $B[h_B(x)]$, which implies an $O(1)$ deterministic query.
+If the hashing functions are fully random or $\log n$-wise independent we get $O(1)$ expected update and $O(1/n)$ probability of failure.
@@ -54,6 +171,7 @@ Dynamic connectivity on trees, euler tour trees.
Dynamic partial sums, dynamic connectivity.
+
\chapter{Integer Structures}
van Emde Boas, x-fast trees, y-fast trees, fusion trees.
@@ -65,7 +183,7 @@ Rank, Select
Locks, Lock-free structures, lists, priority queues.
-
+\printbibliography%
\end{document}