commit fb90afefbf3e6d10164a4c1a496e71a688e600f0
tree   243770c9e9ce42833ba94a62a901c3f0525cad16
parent 33467ad66f8ff3000d157ab9ad0022cf96b2143a
Author:    Thibaut Horel <thibaut.horel@gmail.com>  2015-09-23 10:28:42 -0400
Committer: Thibaut Horel <thibaut.horel@gmail.com>  2015-09-23 10:29:24 -0400
[hw1] Reporting results
 hw1/main.py  | 14
 hw1/main.tex | 34
 hw1/plot.pdf | Bin 0 -> 12923 bytes
 3 files changed, 41 insertions(+), 7 deletions(-)
diff --git a/hw1/main.py b/hw1/main.py
index 3813fa7..ae48050 100644
--- a/hw1/main.py
+++ b/hw1/main.py
@@ -53,11 +53,11 @@ def train_qr(X, y):
 
 def log_posterior(w, X, y, sigma=1.0, tau=10):
     return (1 / (2 * sigma) * np.linalg.norm(y - np.dot(X, w)) ** 2
-            + 1 / (2 * tau) * np.linalg.norm(w) ** 2)
+            + tau / 2 * np.linalg.norm(w) ** 2)
 
 
 def gradient(w, X, y, sigma=1.0, tau=10):
-    return (- 1 / sigma * np.dot(X.T, y - np.dot(X, w)) + 1 / tau * w)
+    return (- 1 / sigma * np.dot(X.T, y - np.dot(X, w)) + tau * w)
 
 
 def verify_gradient(eps=1e-10):
@@ -88,9 +88,7 @@ def train_lbfgs(X, y):
                     options={'maxiter': 100}).x
 
 
-def map_features(X, d):
-    A = np.random.normal(0, 1, size=(d, X.shape[1]))
-    b = np.random.uniform(0, 2 * pi, d)
+def map_features(A, b, X):
     return np.cos(np.dot(A, X.T).T + b)
 
 
@@ -110,8 +108,10 @@ if __name__ == "__main__":
     lqr = []
     llbfgs = []
     for d in [100, 200, 400, 600]:
-        X = map_features(X_train, d)
-        X_t = map_features(X_test, d)
+        A = np.random.normal(0, 1, size=(d, X_train.shape[1]))
+        b = np.random.uniform(0, 2 * pi, d)
+        X = map_features(A, b, X_train)
+        X_t = map_features(A, b, X_test)
 
         t_start = time.time()
         wqr = train_qr(X, y_train)
diff --git a/hw1/main.tex b/hw1/main.tex
index 3aea165..ee9b915 100644
--- a/hw1/main.tex
+++ b/hw1/main.tex
@@ -20,6 +20,7 @@
 \usepackage{amsfonts}
 \usepackage{listings}
 \usepackage{bm}
+\usepackage{graphicx}
 
 % Some useful macros.
 \newcommand{\prob}{\mathbb{P}}
@@ -398,6 +399,13 @@ training data using the QR decomposition (see Section 7.5.2 in Murphy's book).
 \end{itemize}
 \end{problem}
 
+\paragraph{Solution} See file \textsf{main.py} for the code. I obtained:
+\begin{align*}
+  \hat{w} = [&5.55782079, 2.25190765, 1.07880135, -5.91177796, -1.73480336,\\
+  &-1.63875478, -0.26610556, 0.81781409, -0.65913397, 7.74153395]
+\end{align*}
+and an RMSE of 5.20880460.
+
 \begin{problem}[14pts]\label{prob:numerical_linear_model}
 Write code that evaluates the log-posterior probability (up to an additive
 constant) of $\mathbf{w}$ according to the previous Bayesian linear model and
 the normalized training data.
@@ -427,6 +435,15 @@ log-posterior probability to transform the maximization problem into a minimizat
 \end{itemize}
 \end{problem}
 
+\paragraph{Solution} See file \textsf{main.py} for the code. I obtained:
+\begin{align*}
+  \hat{w} = [&5.55780742, 2.2516761, 1.078904, -5.9118024, -1.73438477,\\
+  &-1.63887755, -0.26611567, 0.81785313, -0.65901996, 7.7415115]
+\end{align*}
+and an RMSE of 5.20880242.
+
+\vspace{2em}
+
 Linear regression can be extended to model non-linear relationships by
 replacing the original features $\mathbf{x}$ with some non-linear functions of
 the original features $\bm \phi(\mathbf{x})$. We can automatically generate one
@@ -482,4 +499,21 @@ by the method L-BFGS for each value of $d$. Answer the following questions:
 \end{itemize}
 \end{problem}
 
+\paragraph{Solution} See file \textsf{main.py} for the code. I obtained the following plot:
+\begin{center}
+\includegraphics[width=0.5\textwidth]{plot.pdf}
+\end{center}
+
+We see that the non-linear transformations help reduce the prediction error:
+the data is probably not well modeled by a linear relationship. Allowing more
+complex relationships (the relationship grows more complex as $d$ increases)
+fits the data better.
+
+QR is faster than L-BFGS when the number of features is not too large ($d\leq
+400$). For $d=600$, L-BFGS is faster. The asymptotic complexity of QR is
+$O(nd^2 + d^3)$ whereas the complexity of L-BFGS is $O(nd)$ (for a fixed number
+of iterations). So asymptotically, L-BFGS should be faster, as we observe for
+$d=600$ (for smaller values of $d$ the constants hidden in the $O$ do not allow
+a formal comparison).
+
 \end{document}
diff --git a/hw1/plot.pdf b/hw1/plot.pdf
new file mode 100644
index 0000000..37f2f38
--- /dev/null
+++ b/hw1/plot.pdf
Binary files differ
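
Two remarks on the changes above. First, the map_features rewrite in main.py
fixes a genuine bug rather than style: with the old signature
map_features(X, d), every call drew a fresh random A and b, so the training
and test matrices were produced by two different feature maps, and weights fit
on one were meaningless on the other. Drawing (A, b) once per value of d and
passing them in, as the new code does, makes the two splits consistent. A
minimal runnable sketch of the corrected flow; the toy X_train and X_test
below are placeholders, not the homework data:

    import numpy as np
    from numpy import pi

    def map_features(A, b, X):
        # Random cosine features: row i of the output is cos(A x_i + b).
        return np.cos(np.dot(A, X.T).T + b)

    # Toy placeholders for the homework's normalized train/test splits.
    rng = np.random.RandomState(0)
    X_train, X_test = rng.randn(50, 10), rng.randn(20, 10)

    d = 100
    # Draw the projection once so both splits use the *same* (A, b).
    A = np.random.normal(0, 1, size=(d, X_train.shape[1]))
    b = np.random.uniform(0, 2 * pi, d)
    X = map_features(A, b, X_train)    # shape (50, d)
    X_t = map_features(A, b, X_test)   # shape (20, d)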

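Second, the log_posterior and gradient changes reparameterize the prior: if
w ~ N(0, tau^{-1} I), so that tau is a precision rather than a variance, the
regularizer is (tau / 2) * ||w||^2 and its gradient is tau * w, exactly the
new right-hand sides. The commit's own verify_gradient is not shown in the
diff; the following is an assumed, self-contained central-difference check
(toy data, eps chosen for float64) that the corrected gradient matches the
corrected objective:

    import numpy as np

    def log_posterior(w, X, y, sigma=1.0, tau=10):
        # As committed: the negative log-posterior up to an additive
        # constant, so that maximization becomes minimization.
        return (1 / (2 * sigma) * np.linalg.norm(y - np.dot(X, w)) ** 2
                + tau / 2 * np.linalg.norm(w) ** 2)

    def gradient(w, X, y, sigma=1.0, tau=10):
        return (-1 / sigma * np.dot(X.T, y - np.dot(X, w)) + tau * w)

    def check_gradient(w, X, y, eps=1e-6):
        # Central finite differences, one coordinate at a time.
        num = np.zeros_like(w)
        for i in range(len(w)):
            e = np.zeros_like(w)
            e[i] = eps
            num[i] = (log_posterior(w + e, X, y)
                      - log_posterior(w - e, X, y)) / (2 * eps)
        return np.max(np.abs(num - gradient(w, X, y)))

    rng = np.random.RandomState(1)
    X, w = rng.randn(30, 5), rng.randn(5)
    y = np.dot(X, rng.randn(5)) + 0.1 * rng.randn(30)
    print(check_gradient(w, X, y))  # tiny (~1e-8) if the gradient is right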