[Automated] Update latex-math #29

Open · wants to merge 1 commit into base: main
9 changes: 6 additions & 3 deletions latex-math/basic-math.tex
@@ -1,3 +1,4 @@
% dependencies: amsmath, amssymb, dsfont
% math spaces
\ifdefined\N
\renewcommand{\N}{\mathds{N}} % N, naturals
@@ -6,7 +7,7 @@
\newcommand{\Q}{\mathds{Q}} % Q, rationals
\newcommand{\R}{\mathds{R}} % R, reals
\ifdefined\C
\renewcommand{\C}{\mathds{C}} % C, complex
\renewcommand{\C}{\mathds{C}} % C, complex
\else \newcommand{\C}{\mathds{C}} \fi
\newcommand{\continuous}{\mathcal{C}} % C, space of continuous functions
\newcommand{\M}{\mathcal{M}} % machine numbers
@@ -41,17 +42,19 @@
\newcommand{\sumik}{\sum\limits_{i=1}^k} % summation from i=1 to k
\newcommand{\sumkg}{\sum\limits_{k=1}^g} % summation from k=1 to g
\newcommand{\sumjg}{\sum\limits_{j=1}^g} % summation from j=1 to g
\newcommand{\summM}{\sum\limits_{m=1}^M} % summation from m=1 to M
\newcommand{\meanin}{\frac{1}{n} \sum\limits_{i=1}^n} % mean from i=1 to n
\newcommand{\meanim}{\frac{1}{m} \sum\limits_{i=1}^m} % mean from i=1 to m
\newcommand{\meankg}{\frac{1}{g} \sum\limits_{k=1}^g} % mean from k=1 to g
\newcommand{\meanmM}{\frac{1}{M} \sum\limits_{m=1}^M} % mean from m=1 to M
\newcommand{\prodin}{\prod\limits_{i=1}^n} % product from i=1 to n
\newcommand{\prodkg}{\prod\limits_{k=1}^g} % product from k=1 to g
\newcommand{\prodjp}{\prod\limits_{j=1}^p} % product from j=1 to p

% linear algebra
\newcommand{\one}{\boldsymbol{1}} % 1, unitvector
\newcommand{\one}{\bm{1}} % 1, unitvector
\newcommand{\zero}{\mathbf{0}} % 0-vector
\newcommand{\id}{\boldsymbol{I}} % I, identity
\newcommand{\id}{\bm{I}} % I, identity
\newcommand{\diag}{\operatorname{diag}} % diag, diagonal
\newcommand{\trace}{\operatorname{tr}} % tr, trace
\newcommand{\spn}{\operatorname{span}} % span
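
For context, a minimal compile sketch of the macros touched above (not part of the diff; assumes compilation from the repository root, with dsfont providing \mathds and the bm package providing the new \bm calls):

\documentclass{article}
\usepackage{amsmath,amssymb,dsfont,bm} % dependencies named in the file header, plus bm for \bm
\input{latex-math/basic-math.tex}
\begin{document}
% new summation/mean shortcuts and the \bm-based vector symbols
$\summM \tfrac{1}{M} = 1$, \quad $\meanmM m = \tfrac{M+1}{2}$, \quad
$\id \in \R^{n \times n}$, \quad $\one, \zero \in \R^n$
\end{document}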
67 changes: 35 additions & 32 deletions latex-math/basic-ml.tex
@@ -1,7 +1,7 @@
% machine learning
\newcommand{\Xspace}{\mathcal{X}} % X, input space
\newcommand{\Yspace}{\mathcal{Y}} % Y, output space
\newcommand{\Zspace}{\mathcal{Z}} % Space of sampled datapoints ! Also defined identically in ml-online.tex !
\newcommand{\Zspace}{\mathcal{Z}} % Z, space of sampled datapoints
\newcommand{\nset}{\{1, \ldots, n\}} % set from 1 to n
\newcommand{\pset}{\{1, \ldots, p\}} % set from 1 to p
\newcommand{\gset}{\{1, \ldots, g\}} % set from 1 to g
@@ -26,6 +26,7 @@
\newcommand{\xdat}{\left\{ \xv^{(1)}, \ldots, \xv^{(n)}\right\}} % {x1, ..., xn}, input data
\newcommand{\ydat}{\left\{ \yv^{(1)}, \ldots, \yv^{(n)}\right\}} % {y1, ..., yn}, output data
\newcommand{\yvec}{\left(y^{(1)}, \hdots, y^{(n)}\right)^\top} % (y1, ..., yn), vector of outcomes
\let\greekxi\xi % Greek letter xi (saved via \let before \xi is redefined below)
\renewcommand{\xi}[1][i]{\xv^{(#1)}} % x^i, i-th observed value of x
\newcommand{\yi}[1][i]{y^{(#1)}} % y^i, i-th observed value of y
\newcommand{\xivec}{\left(x^{(i)}_1, \ldots, x^{(i)}_p\right)^\top} % (x1^i, ..., xp^i), i-th observation vector
@@ -54,10 +55,10 @@
\newcommand{\fkx}[1][k]{f_{#1}(\xv)} % f_j(x), discriminant component function
\newcommand{\fh}{\hat{f}} % f hat, estimated prediction function
\newcommand{\fxh}{\fh(\xv)} % fhat(x)
\newcommand{\fxt}{f(\xv ~|~ \thetab)} % f(x | theta)
\newcommand{\fxt}{f(\xv ~|~ \thetav)} % f(x | theta)
\newcommand{\fxi}{f\left(\xv^{(i)}\right)} % f(x^(i))
\newcommand{\fxih}{\hat{f}\left(\xv^{(i)}\right)} % fhat(x^(i))
\newcommand{\fxit}{f\left(\xv^{(i)} ~|~ \thetab\right)} % f(x^(i) | theta)
\newcommand{\fxit}{f\left(\xv^{(i)} ~|~ \thetav\right)} % f(x^(i) | theta)
\newcommand{\fhD}{\fh_{\D}} % fhat_D, estimate of f based on D
\newcommand{\fhDtrain}{\fh_{\Dtrain}} % fhat_Dtrain, estimate of f based on D
\newcommand{\fhDnlam}{\fh_{\Dn, \lamv}} %model learned on Dn with hp lambda
@@ -69,9 +70,9 @@
\newcommand{\hx}{h(\xv)} % h(x), discrete prediction function
\newcommand{\hh}{\hat{h}} % h hat
\newcommand{\hxh}{\hat{h}(\xv)} % hhat(x)
\newcommand{\hxt}{h(\xv | \thetab)} % h(x | theta)
\newcommand{\hxt}{h(\xv | \thetav)} % h(x | theta)
\newcommand{\hxi}{h\left(\xi\right)} % h(x^(i))
\newcommand{\hxit}{h\left(\xi ~|~ \thetab\right)} % h(x^(i) | theta)
\newcommand{\hxit}{h\left(\xi ~|~ \thetav\right)} % h(x^(i) | theta)
\newcommand{\hbayes}{h^{\ast}} % Bayes-optimal classification model
\newcommand{\hxbayes}{h^{\ast}(\xv)} % Bayes-optimal classification model

@@ -82,27 +83,27 @@

% theta
\newcommand{\thetah}{\hat{\theta}} % theta hat
\newcommand{\thetab}{\bm{\theta}} % theta vector
\newcommand{\thetabh}{\bm{\hat\theta}} % theta vector hat
\newcommand{\thetat}[1][t]{\thetab^{[#1]}} % theta^[t] in optimization
\newcommand{\thetatn}[1][t]{\thetab^{[#1 +1]}} % theta^[t+1] in optimization
\newcommand{\thetahDnlam}{\thetabh_{\Dn, \lamv}} %theta learned on Dn with hp lambda
\newcommand{\thetahDlam}{\thetabh_{\D, \lamv}} %theta learned on D with hp lambda
\newcommand{\mint}{\min_{\thetab \in \Theta}} % min problem theta
\newcommand{\argmint}{\argmin_{\thetab \in \Theta}} % argmin theta
\newcommand{\thetav}{\bm{\theta}} % theta vector
\newcommand{\thetavh}{\bm{\hat\theta}} % theta vector hat
\newcommand{\thetat}[1][t]{\thetav^{[#1]}} % theta^[t] in optimization
\newcommand{\thetatn}[1][t]{\thetav^{[#1 +1]}} % theta^[t+1] in optimization
\newcommand{\thetahDnlam}{\thetavh_{\Dn, \lamv}} %theta learned on Dn with hp lambda
\newcommand{\thetahDlam}{\thetavh_{\D, \lamv}} %theta learned on D with hp lambda
\newcommand{\mint}{\min_{\thetav \in \Theta}} % min problem theta
\newcommand{\argmint}{\argmin_{\thetav \in \Theta}} % argmin theta

% densities + probabilities
% pdf of x
\newcommand{\pdf}{p} % p
\newcommand{\pdfx}{p(\xv)} % p(x)
\newcommand{\pixt}{\pi(\xv~|~ \thetab)} % pi(x|theta), pdf of x given theta
\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetab\right)} % pi(x^i|theta), pdf of x given theta
\newcommand{\pixt}{\pi(\xv~|~ \thetav)} % pi(x|theta), pdf of x given theta
\newcommand{\pixit}[1][i]{\pi\left(\xi[#1] ~|~ \thetav\right)} % pi(x^i|theta), pdf of x given theta
\newcommand{\pixii}[1][i]{\pi\left(\xi[#1]\right)} % pi(x^i), pdf of i-th x

% pdf of (x, y)
\newcommand{\pdfxy}{p(\xv,y)} % p(x, y)
\newcommand{\pdfxyt}{p(\xv, y ~|~ \thetab)} % p(x, y | theta)
\newcommand{\pdfxyit}{p\left(\xi, \yi ~|~ \thetab\right)} % p(x^(i), y^(i) | theta)
\newcommand{\pdfxyt}{p(\xv, y ~|~ \thetav)} % p(x, y | theta)
\newcommand{\pdfxyit}{p\left(\xi, \yi ~|~ \thetav\right)} % p(x^(i), y^(i) | theta)

% pdf of x given y
\newcommand{\pdfxyk}[1][k]{p(\xv | y= #1)} % p(x | y = k)
@@ -112,7 +113,7 @@
% prior probabilities
\newcommand{\pik}[1][k]{\pi_{#1}} % pi_k, prior
\newcommand{\lpik}[1][k]{\log \pi_{#1}} % log pi_k, log of the prior
\newcommand{\pit}{\pi(\thetab)} % Prior probability of parameter theta
\newcommand{\pit}{\pi(\thetav)} % Prior probability of parameter theta

% posterior probabilities
\newcommand{\post}{\P(y = 1 ~|~ \xv)} % P(y = 1 | x), post. prob for y=1
@@ -123,13 +124,13 @@
\newcommand{\pix}{\pi(\xv)} % pi(x), P(y = 1 | x)
\newcommand{\piv}{\bm{\pi}} % pi, bold, as vector
\newcommand{\pikx}[1][k]{\pi_{#1}(\xv)} % pi_k(x), P(y = k | x)
\newcommand{\pikxt}[1][k]{\pi_{#1}(\xv ~|~ \thetab)} % pi_k(x | theta), P(y = k | x, theta)
\newcommand{\pikxt}[1][k]{\pi_{#1}(\xv ~|~ \thetav)} % pi_k(x | theta), P(y = k | x, theta)
\newcommand{\pixh}{\hat \pi(\xv)} % pi(x) hat, P(y = 1 | x) hat
\newcommand{\pikxh}[1][k]{\hat \pi_{#1}(\xv)} % pi_k(x) hat, P(y = k | x) hat
\newcommand{\pixih}{\hat \pi(\xi)} % pi(x^(i)) with hat
\newcommand{\pikxih}[1][k]{\hat \pi_{#1}(\xi)} % pi_k(x^(i)) with hat
\newcommand{\pdfygxt}{p(y ~|~\xv, \thetab)} % p(y | x, theta)
\newcommand{\pdfyigxit}{p\left(\yi ~|~\xi, \thetab\right)} % p(y^i |x^i, theta)
\newcommand{\pdfygxt}{p(y ~|~\xv, \thetav)} % p(y | x, theta)
\newcommand{\pdfyigxit}{p\left(\yi ~|~\xi, \thetav\right)} % p(y^i |x^i, theta)
\newcommand{\lpdfygxt}{\log \pdfygxt } % log p(y | x, theta)
\newcommand{\lpdfyigxit}{\log \pdfyigxit} % log p(y^i |x^i, theta)

@@ -139,8 +140,10 @@

% residual and margin
\newcommand{\eps}{\epsilon} % residual, stochastic
\newcommand{\epsv}{\bm{\epsilon}} % residual, stochastic, as vector
\newcommand{\epsi}{\epsilon^{(i)}} % epsilon^i, residual, stochastic
\newcommand{\epsh}{\hat{\epsilon}} % residual, estimated
\newcommand{\epsvh}{\hat{\epsv}} % residual, estimated, vector
\newcommand{\yf}{y \fx} % y f(x), margin
\newcommand{\yfi}{\yi \fxi} % y^i f(x^i), margin
\newcommand{\Sigmah}{\hat \Sigma} % estimated covariance matrix
@@ -153,7 +156,7 @@
\newcommand{\Lxyi}{L\left(\yi, \fxi\right)} % loss of observation
\newcommand{\Lxyt}{L\left(y, \fxt\right)} % loss with f parameterized
\newcommand{\Lxyit}{L\left(\yi, \fxit\right)} % loss of observation with f parameterized
\newcommand{\Lxym}{L\left(\yi, f\left(\bm{\tilde{x}}^{(i)} ~|~ \thetab\right)\right)} % loss of observation with f parameterized
\newcommand{\Lxym}{L\left(\yi, f\left(\bm{\tilde{x}}^{(i)} ~|~ \thetav\right)\right)} % loss of observation with f parameterized
\newcommand{\Lpixy}{L\left(y, \pix\right)} % loss in classification
\newcommand{\Lpiv}{L\left(y, \piv\right)} % loss in classification
\newcommand{\Lpixyi}{L\left(\yi, \pixii\right)} % loss of observation in classification
@@ -171,26 +174,26 @@
\newcommand{\riskbayes}{\mathcal{R}^\ast}
\newcommand{\riskf}{\risk(f)} % R(f), risk
\newcommand{\riskdef}{\E_{y|\xv}\left(\Lxy \right)} % risk def (expected loss)
\newcommand{\riskt}{\mathcal{R}(\thetab)} % R(theta), risk
\newcommand{\riskt}{\mathcal{R}(\thetav)} % R(theta), risk
\newcommand{\riske}{\mathcal{R}_{\text{emp}}} % R_emp, empirical risk w/o factor 1 / n
\newcommand{\riskeb}{\bar{\mathcal{R}}_{\text{emp}}} % R_emp, empirical risk w/ factor 1 / n
\newcommand{\riskef}{\riske(f)} % R_emp(f)
\newcommand{\risket}{\mathcal{R}_{\text{emp}}(\thetab)} % R_emp(theta)
\newcommand{\risket}{\mathcal{R}_{\text{emp}}(\thetav)} % R_emp(theta)
\newcommand{\riskr}{\mathcal{R}_{\text{reg}}} % R_reg, regularized risk
\newcommand{\riskrt}{\mathcal{R}_{\text{reg}}(\thetab)} % R_reg(theta)
\newcommand{\riskrt}{\mathcal{R}_{\text{reg}}(\thetav)} % R_reg(theta)
\newcommand{\riskrf}{\riskr(f)} % R_reg(f)
\newcommand{\riskrth}{\hat{\mathcal{R}}_{\text{reg}}(\thetab)} % hat R_reg(theta)
\newcommand{\risketh}{\hat{\mathcal{R}}_{\text{emp}}(\thetab)} % hat R_emp(theta)
\newcommand{\riskrth}{\hat{\mathcal{R}}_{\text{reg}}(\thetav)} % hat R_reg(theta)
\newcommand{\risketh}{\hat{\mathcal{R}}_{\text{emp}}(\thetav)} % hat R_emp(theta)
\newcommand{\LL}{\mathcal{L}} % L, likelihood
\newcommand{\LLt}{\mathcal{L}(\thetab)} % L(theta), likelihood
\newcommand{\LLtx}{\mathcal{L}(\thetab | \xv)} % L(theta|x), likelihood
\newcommand{\LLt}{\mathcal{L}(\thetav)} % L(theta), likelihood
\newcommand{\LLtx}{\mathcal{L}(\thetav | \xv)} % L(theta|x), likelihood
\newcommand{\logl}{\ell} % l, log-likelihood
\newcommand{\loglt}{\logl(\thetab)} % l(theta), log-likelihood
\newcommand{\logltx}{\logl(\thetab | \xv)} % l(theta|x), log-likelihood
\newcommand{\loglt}{\logl(\thetav)} % l(theta), log-likelihood
\newcommand{\logltx}{\logl(\thetav | \xv)} % l(theta|x), log-likelihood
\newcommand{\errtrain}{\text{err}_{\text{train}}} % training error
\newcommand{\errtest}{\text{err}_{\text{test}}} % test error
\newcommand{\errexp}{\overline{\text{err}_{\text{test}}}} % avg test error

% lm
\newcommand{\thx}{\thetab^\top \xv} % linear model
\newcommand{\thx}{\thetav^\top \xv} % linear model
\newcommand{\olsest}{(\Xmat^\top \Xmat)^{-1} \Xmat^\top \yv} % OLS estimator in LM
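
Likewise, a short hedged sketch of the renamed theta macros (same minimal preamble as the basic-math sketch above; \alpha below is an assumed step size, not a repo macro):

\input{latex-math/basic-ml.tex} % preamble, after basic-math.tex
% document body: log-likelihood, ERM objective, and a gradient step, all written with \thetav
$\loglt = \log \LLt$, \qquad $\mint \risket$, \qquad
$\thetatn = \thetat - \alpha \nabla \riske(\thetat)$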
23 changes: 18 additions & 5 deletions latex-math/ml-ensembles.tex
@@ -2,12 +2,25 @@
\newcommand{\bl}[1][m]{b^{[#1]}} % baselearner, default m
\newcommand{\blh}[1][m]{\hat{b}^{[#1]}} % estimated base learner, default m
\newcommand{\blx}[1][m]{b^{[#1]}(\xv)} % baselearner, default m
\newcommand{\blf}[1][m]{f^{[#1]}} % baselearner: scores, default m
\newcommand{\blfh}[1][m]{\hat{f}^{[#1]}} % estimated baselearner: scores, default m
\newcommand{\blfhx}[1][m]{\hat{f}^{[#1]}(\xv)} % estimated baselearner: scores of x, default m
\newcommand{\bll}[1][m]{h^{[#1]}} % baselearner: hard labels, default m
\newcommand{\bllh}[1][m]{\hat{h}^{[#1]}} % estimated baselearner: hard labels, default m
\newcommand{\bllhx}[1][m]{\hat{h}^{[#1]}(\xv)} % estimated baselearner: hard labels of x, default m
\newcommand{\blp}[1][m]{\pi^{[#1]}} % baselearner: probabilities, default m
\newcommand{\blph}[1][m]{\hat{\pi}^{[#1]}} % estimated baselearner: probabilities, default m
\newcommand{\blphxk}[1][m]{\hat{\pi}_{k}^{[#1]}(\xv)} % estimated baselearner: probabilities of x for class k, default m
\newcommand{\fM}{f^{[M]}(\xv)} % ensembled predictor
\newcommand{\fMh}{\hat f^{[M]}(\xv)} % estimated ensembled predictor
\newcommand{\ambifM}{\Delta\left(\fM\right)} % ambiguity/instability of ensemble
\newcommand{\betam}[1][m]{\beta^{[#1]}} % weight of basemodel m
\newcommand{\betamh}[1][m]{\hat{\beta}^{[#1]}} % weight of basemodel m with hat
\newcommand{\betaM}{\beta^{[M]}} % last baselearner
\newcommand{\ib}{\mathrm{IB}} % In-Bag (IB)
\newcommand{\ibm}{\ib^{[m]}} % In-Bag (IB) for m-th bootstrap
\newcommand{\oob}{\mathrm{OOB}} % Out-of-Bag (OOB)
\newcommand{\oobm}{\oob^{[m]}} % Out-of-Bag (OOB) for m-th bootstrap

% ml - boosting
\newcommand{\fm}[1][m]{f^{[#1]}} % prediction in iteration m
@@ -17,9 +30,9 @@
\newcommand{\errm}[1][m]{\text{err}^{[#1]}} % weighted in-sample misclassification rate
\newcommand{\wm}[1][m]{w^{[#1]}} % weight vector of basemodel m
\newcommand{\wmi}[1][m]{w^{[#1](i)}} % weight of obs i of basemodel m
\newcommand{\thetam}[1][m]{\thetab^{[#1]}} % parameters of basemodel m
\newcommand{\thetamh}[1][m]{\hat{\thetab}^{[#1]}} % parameters of basemodel m with hat
\newcommand{\blxt}[1][m]{b(\xv, \thetab^{[#1]})} % baselearner, default m
\newcommand{\thetam}[1][m]{\thetav^{[#1]}} % parameters of basemodel m
\newcommand{\thetamh}[1][m]{\hat{\thetav}^{[#1]}} % parameters of basemodel m with hat
\newcommand{\blxt}[1][m]{b(\xv, \thetav^{[#1]})} % baselearner, default m
\newcommand{\ens}{\sum_{m=1}^M \betam \blxt} % ensemble
\newcommand{\rmm}[1][m]{\tilde{r}^{[#1]}} % pseudo residuals
\newcommand{\rmi}[1][m]{\tilde{r}^{[#1](i)}} % pseudo residuals
@@ -33,6 +46,6 @@
\newcommand{\Lpleft}{\Lp_{\text{left}}}

% ml - boosting iml lecture
\newcommand{\ts}{\thetab^{\star}} % theta*
\newcommand{\bljt}{\bl[j](\xv, \thetab)} % BL j with theta
\newcommand{\ts}{\thetav^{\star}} % theta*
\newcommand{\bljt}{\bl[j](\xv, \thetav)} % BL j with theta
\newcommand{\bljts}{\bl[j](\xv, \ts)} % BL j with theta*
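
The new bagging macros compose with the existing ones; a sketch (same preamble, with basic-math.tex and basic-ml.tex loaded first so that \xv and \nset are available):

\input{latex-math/ml-ensembles.tex} % preamble, after basic-math.tex and basic-ml.tex
% document body: bagging averages the estimated base-learner scores;
% the OOB set of the m-th bootstrap is the complement of its in-bag set
$\fMh = \meanmM \blfhx$, \qquad $\oobm = \nset \setminus \ibm$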
2 changes: 1 addition & 1 deletion latex-math/ml-eval.tex
@@ -41,7 +41,7 @@

% performance measure
\newcommand{\rhoL}{\rho_L} % perf. measure derived from pointwise loss
\newcommand{\F}{\boldsymbol{F}} % matrix of prediction scores
\newcommand{\F}{\bm{F}} % matrix of prediction scores
\newcommand{\Fi}[1][i]{\F^{(#1)}} % i-th row vector of the predscore mat
\newcommand{\FJ}[1][J]{\F_{#1}} % predscore mat idxvec J
\newcommand{\FJf}{\FJ[J,f]} % predscore mat idxvec J and model f
2 changes: 1 addition & 1 deletion latex-math/ml-infotheory.tex
@@ -7,6 +7,6 @@
\newcommand{\cdentyx}{- \int_{\Xspace, \Yspace} f(x, y) \cdot \log f(y | x) dx dy} % cond diff entropy y|x
\newcommand{\xentpq}{- \sum_{x \in \Xspace} p(x) \cdot \log q(x)} % cross-entropy of p, q
\newcommand{\kldpq}{D_{KL}(p \| q)} % KLD between p and q
\newcommand{\kldpqt}{D_{KL}(p \| q_{\thetab})} % KLD divergence between p and parameterized q
\newcommand{\kldpqt}{D_{KL}(p \| q_{\thetav})} % KLD divergence between p and parameterized q
\newcommand{\explogpq}{\E_p \left[\log \frac{p(X)}{q(X)} \right]} % expected LLR of p, q (def KLD)
\newcommand{\sumlogpq}{\sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q(x)}} % expected LLR of p, q (def KLD)
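
The only change here is the \thetav rename in \kldpqt; for reference, the identity the KLD macros encode (a sketch, same preamble, with basic-ml.tex loaded so \Xspace and \thetav exist):

\input{latex-math/ml-infotheory.tex} % preamble
% document body: discrete KL divergence and its parameterized variant
\[ \kldpq = \sumlogpq, \qquad \kldpqt = \sum_{x \in \Xspace} p(x) \cdot \log \frac{p(x)}{q_{\thetav}(x)} \]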
16 changes: 8 additions & 8 deletions latex-math/ml-nn.tex
@@ -19,18 +19,18 @@
\newcommand{\Odropout}{\mathnormal{J}(\theta, \mu|X,y)} % dropout objective function

% deeplearning - optimization
\newcommand{\Loss}{L(y, f(\xv, \thetab))}
\newcommand{\Lmomentumnest}{L(\yi, f(x^{(i)}, \thetab + \varphi \nub))} % momentum risk
\newcommand{\Lmomentumtilde}{L(\yi, f(x^{(i)}, \tilde{\thetab}))} % Nesterov momentum risk
\newcommand{\Lmomentum}{L(\yi, f(x^{(i)}, \thetab))}
\newcommand{\Loss}{L(y, f(\xv, \thetav))}
\newcommand{\Lmomentumnest}{L(\yi, f(x^{(i)}, \thetav + \varphi \nub))} % momentum risk
\newcommand{\Lmomentumtilde}{L(\yi, f(x^{(i)}, \tilde{\thetav}))} % Nesterov momentum risk
\newcommand{\Lmomentum}{L(\yi, f(x^{(i)}, \thetav))}
\newcommand{\Hess}{\mathbf{H}}
\newcommand{\nub}{\boldsymbol{\nu}}
\newcommand{\nub}{\bm{\nu}}

% deeplearning - autoencoders
\newcommand{\uauto}{L(x,g(f(x)))} % undercomplete autoencoder objective function
\newcommand{\dauto}{L(x,g(f(\tilde{x})))} % denoising autoencoder objective function

% deeplearning - adversarials
\newcommand{\deltab}{\boldsymbol{\delta}}
\newcommand{\Lossdeltai}{L(\yi, f(\xi + \deltab|\thetab))}
\newcommand{\Lossdelta}{L(y, f(\xv + \deltab| \thetab))}
\newcommand{\deltab}{\bm{\delta}}
\newcommand{\Lossdeltai}{L(\yi, f(\xi + \deltab|\thetav))}
\newcommand{\Lossdelta}{L(y, f(\xv + \deltab| \thetav))}
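
Same rename pattern for the deep learning macros; the two momentum losses are meant to coincide under the Nesterov lookahead (a sketch, same preamble, with basic-ml.tex loaded first):

\input{latex-math/ml-nn.tex} % preamble
% document body: with the lookahead \tilde{\thetav} = \thetav + \varphi \nub,
% \Lmomentumtilde and \Lmomentumnest denote the same quantity
\[ \Lmomentumtilde = \Lmomentumnest \quad \text{for } \tilde{\thetav} = \thetav + \varphi \nub \]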
6 changes: 6 additions & 0 deletions latex-math/ml-regu.tex
@@ -0,0 +1,6 @@
% \thetah is \hat{\theta} (theta hat)
% \thetav is \bm{\theta} (theta vector)
\newcommand{\thetas}{\thetav^*} % theta star
\newcommand{\thetaridge}{\thetav_{\mathrm{ridge}}} % theta (RIDGE)
\newcommand{\thetalasso}{\thetav_{\mathrm{LASSO}}} % theta (LASSO)
\newcommand{\thetaols}{\thetav_{\mathrm{OLS}}} % theta (OLS)
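
A usage sketch for the new file (assumes \Xmat, \yv, and \olsest from basic-ml.tex are loaded, and that \lambda denotes the ridge penalty; the closed forms below are the textbook OLS/ridge estimators, not definitions from the repo):

\input{latex-math/ml-regu.tex} % preamble, after basic-math.tex and basic-ml.tex
% document body: OLS and ridge estimators in the linear model
\[ \thetaols = \olsest, \qquad \thetaridge = (\Xmat^\top \Xmat + \lambda \id)^{-1} \Xmat^\top \yv \]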
6 changes: 3 additions & 3 deletions latex-math/ml-svm.tex
@@ -3,8 +3,8 @@
\renewcommand{\sl}{\zeta} % slack variable
\newcommand{\slvec}{\left(\zeta^{(1)}, \ldots, \zeta^{(n)}\right)} % slack variable vector
\newcommand{\sli}[1][i]{\zeta^{(#1)}} % i-th slack variable
\newcommand{\scptxi}{\scp{\thetab}{\xi}} % scalar product of theta and xi
\newcommand{\svmhplane}{\yi \left( \scp{\thetab}{\xi} + \theta_0 \right)} % SVM hyperplane (normalized)
\newcommand{\scptxi}{\scp{\thetav}{\xi}} % scalar product of theta and xi
\newcommand{\svmhplane}{\yi \left( \scp{\thetav}{\xi} + \theta_0 \right)} % SVM hyperplane (normalized)
\newcommand{\alphah}{\hat{\alpha}} % alpha-hat (basis fun coefficients)
\newcommand{\alphav}{\bm{\alpha}} % vector alpha (bold) (basis fun coefficients)
\newcommand{\alphavh}{\hat{\bm{\alpha}}} % vector alpha-hat (basis fun coefficients)
@@ -15,4 +15,4 @@
\newcommand{\phix}{\phi(\xv)} % feature map x
\newcommand{\phixt}{\phi(\tilde \xv)} % feature map x tilde
\newcommand{\kxxt}{k(\xv, \tilde \xv)} % kernel fun x, x tilde
\newcommand{\scptxifm}{\scp{\thetab}{\phi(\xi)}} % scalar product of theta and phi(xi)
\newcommand{\scptxifm}{\scp{\thetav}{\phi(\xi)}} % scalar product of theta and phi(xi)
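
Finally, the SVM macros with \thetav; the standard soft-margin constraints as a sketch (same preamble; \scp is assumed to be the scalar-product macro defined elsewhere in latex-math):

\input{latex-math/ml-svm.tex} % preamble, after basic-math.tex and basic-ml.tex
% document body: soft-margin constraints per observation i
\[ \svmhplane \;\geq\; 1 - \sli, \qquad \sli \geq 0 \]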