(this.webpackJsonpapp=this.webpackJsonpapp||[]).push([[0],[,,function(e,t,n){"use strict";n.d(t,"a",(function(){return o}));var a=n(42),o=function e(t,n,o,i,s,r,l){Object(a.a)(this,e),this.title=void 0,this.subtitle=void 0,this.publishDate=void 0,this.titleImageUrl=void 0,this.titleImageDescription=void 0,this.tags=void 0,this.content=void 0,this.title=t,this.subtitle=n,this.publishDate=o,this.titleImageUrl=i,this.titleImageDescription=s,this.tags=r,this.content=l}},function(e,t,n){e.exports={navbar:"Navbar_navbar__1kY7-",navbarHide:"Navbar_navbarHide__2vmnV",navbarHideFully:"Navbar_navbarHideFully__3zmeO",text:"Navbar_text__1dfT6",navUl:"Navbar_navUl__1ilzm",navUlLi:"Navbar_navUlLi__3HcLs",navUlLiA:"Navbar_navUlLiA__2_JKe"}},function(e,t,n){e.exports={tagContainer:"ToxBlock_tagContainer__1DhrR",tagLineBreak:"ToxBlock_tagLineBreak__5_qWD",tagButton:"ToxBlock_tagButton__3-JAJ",tagActive:"ToxBlock_tagActive__1rPLa",postContainer:"ToxBlock_postContainer__3uHbg",titleRow:"ToxBlock_titleRow__xoP01",postTitle:"ToxBlock_postTitle__ssA21",postSubtitle:"ToxBlock_postSubtitle__1CrhM",publishDate:"ToxBlock_publishDate__1XSqa",postTag:"ToxBlock_postTag__HgId4",imageContainer:"ToxBlock_imageContainer__3DPKP",postContentContainer:"ToxBlock_postContentContainer__2rY2X",postImage:"ToxBlock_postImage__1XoU7",textPreviewContainer:"ToxBlock_textPreviewContainer__25Q_e",textPreviewContainerp:"ToxBlock_textPreviewContainerp__2rCUh",bottomRow:"ToxBlock_bottomRow__2kydd",bottomLine:"ToxBlock_bottomLine__A3k7u",page:"ToxBlock_page__2d8Iy",postPageToc:"ToxBlock_postPageToc__3IQpS",tocHide:"ToxBlock_tocHide__1awrS",current:"ToxBlock_current__r-Lg4",toxblockTitle:"ToxBlock_toxblockTitle__1vi2A",toxSpan:"ToxBlock_toxSpan__3khag",blockSpan:"ToxBlock_blockSpan__1RHXR",resourceList:"ToxBlock_resourceList__3Ur4F",toxblockImg:"ToxBlock_toxblockImg__1i6Gd",toxblockPostButtonDiv:"ToxBlock_toxblockPostButtonDiv__nq1xz",toxblockPostButton:"ToxBlock_toxblockPostButton__2VJgw",toxblockAppStatus:"ToxBlock_toxblockAppStatus__3qpY2",toxblockFormControl:"ToxBlock_toxblockFormControl__23f2x",error:"ToxBlock_error__1OWAw",toxblockInput:"ToxBlock_toxblockInput__2WTha",toxblockChart:"ToxBlock_toxblockChart__WC7Rs",toxblockBarLabel:"ToxBlock_toxblockBarLabel__37fZb",toxblockBarNumber:"ToxBlock_toxblockBarNumber__1LXuN",toxblockBar:"ToxBlock_toxblockBar__3uHDc",toxicBar:"ToxBlock_toxicBar__3FGW-",severeToxicBar:"ToxBlock_severeToxicBar__Rs4rq",obsceneBar:"ToxBlock_obsceneBar__3UilE",insultBar:"ToxBlock_insultBar__3VcPK",threatBar:"ToxBlock_threatBar__3MHVB",identityHateBar:"ToxBlock_identityHateBar__3TNM4"}},function(e,t,n){e.exports={background:"SkillsSection_background__5Ej34",sectionContainer:"SkillsSection_sectionContainer__25YXX",sectionSeparator:"SkillsSection_sectionSeparator__39jLV",paragraph:"SkillsSection_paragraph__3RssP",sectionHeader:"SkillsSection_sectionHeader__GKEz4",sectionFooter:"SkillsSection_sectionFooter__1WOiM",modalParagraph:"SkillsSection_modalParagraph__28Laz",fillerContainer:"SkillsSection_fillerContainer__1Hk_F",fillerIcon:"SkillsSection_fillerIcon__1Oi4N"}},function(e,t,n){e.exports={tagContainer:"DoggoSnap_tagContainer__227Zl",tagLineBreak:"DoggoSnap_tagLineBreak__3eTOX",tagButton:"DoggoSnap_tagButton__2PV-W",tagActive:"DoggoSnap_tagActive__3FOMK",postContainer:"DoggoSnap_postContainer__bW-kh",titleRow:"DoggoSnap_titleRow__1sE3R",postTitle:"DoggoSnap_postTitle__2PxxB",postSubtitle:"DoggoSnap_postSubtitle__2I09S",publishDate:"DoggoSnap_publishDate__3qZEe",postTag:"DoggoSnap_postTag__KIyS8",imageContainer:"Doggo
Snap_imageContainer__1ZRg_",postContentContainer:"DoggoSnap_postContentContainer__-taWI",postImage:"DoggoSnap_postImage__30BJv",textPreviewContainer:"DoggoSnap_textPreviewContainer__3Qwsm",textPreviewContainerp:"DoggoSnap_textPreviewContainerp__FWCln",bottomRow:"DoggoSnap_bottomRow__3tzIe",bottomLine:"DoggoSnap_bottomLine__21uAe",page:"DoggoSnap_page__O_3ju",postPageToc:"DoggoSnap_postPageToc__2HYj9",tocHide:"DoggoSnap_tocHide__2oNmH",current:"DoggoSnap_current__3B5Lc",title:"DoggoSnap_title__16UNJ",dogImg:"DoggoSnap_dogImg__32BTN",paragraph:"DoggoSnap_paragraph__2T8mx",doggosnapAppStatus:"DoggoSnap_doggosnapAppStatus__1RNwg",hidden:"DoggoSnap_hidden__iNksh",error:"DoggoSnap_error__1iuva"}},function(e,t,n){e.exports={tagContainer:"PostPage_tagContainer__qYPMo",tagLineBreak:"PostPage_tagLineBreak__1IC6n",tagButton:"PostPage_tagButton__2qmq5",tagActive:"PostPage_tagActive__3Jgst",postContainer:"PostPage_postContainer__316tc",titleRow:"PostPage_titleRow__AwHZK",postTitle:"PostPage_postTitle__IzGAu",postSubtitle:"PostPage_postSubtitle__3uhne",publishDate:"PostPage_publishDate__1rkkb",postTag:"PostPage_postTag__10EIS",imageContainer:"PostPage_imageContainer__2Jq8N",postContentContainer:"PostPage_postContentContainer__1Gqlk",postImage:"PostPage_postImage__2P3En",textPreviewContainer:"PostPage_textPreviewContainer__d9FNm",textPreviewContainerp:"PostPage_textPreviewContainerp__3J9h4",bottomRow:"PostPage_bottomRow__xX6ai",bottomLine:"PostPage_bottomLine__ssD5F",page:"PostPage_page__2HpXr",postPageToc:"PostPage_postPageToc__347Nz",tocHide:"PostPage_tocHide__1UCPO",current:"PostPage_current__30PPi"}},,function(e,t,n){e.exports={background:"BioSection_background__W9m7N",sectionContainer:"BioSection_sectionContainer__2tqHC",sectionSeparator:"BioSection_sectionSeparator__1rdRE",paragraph:"BioSection_paragraph__3-qk_",sectionHeader:"BioSection_sectionHeader__3XvMM",sectionFooter:"BioSection_sectionFooter__yf7A2",bioListContainer:"BioSection_bioListContainer__34_Df",bioList:"BioSection_bioList__3j3wi",bioYear:"BioSection_bioYear__jGlor"}},,function(e,t,n){e.exports={tagContainer:"Posts_tagContainer__1c6EE",tagLineBreak:"Posts_tagLineBreak__13ErV",tagButton:"Posts_tagButton__1_jB5",tagActive:"Posts_tagActive__NFg_A",postContainer:"Posts_postContainer__3tRxD",titleRow:"Posts_titleRow__RVrJF",postTitle:"Posts_postTitle__TPiO1",postSubtitle:"Posts_postSubtitle__3RoGp",publishDate:"Posts_publishDate__374vX",postTag:"Posts_postTag__3xtoa",imageContainer:"Posts_imageContainer__2eDc_",postImage:"Posts_postImage__1KIaI",textPreviewContainer:"Posts_textPreviewContainer__1Dyvi",textPreviewContainerp:"Posts_textPreviewContainerp__3rh5n",bottomRow:"Posts_bottomRow__1ibmy",bottomLine:"Posts_bottomLine__1aBoU"}},,,,,function(e,t,n){e.exports={background:"AboutMeSection_background__2YrLH",sectionContainer:"AboutMeSection_sectionContainer__1W1qh",sectionSeparator:"AboutMeSection_sectionSeparator__23zv9",paragraph:"AboutMeSection_paragraph__uxvgw",sectionHeader:"AboutMeSection_sectionHeader__3NtH-",sectionFooter:"AboutMeSection_sectionFooter__g4gT4",imageContainer:"AboutMeSection_imageContainer__24VAl",meImage:"AboutMeSection_meImage__2cmVL",cvButton:"AboutMeSection_cvButton__2_anx",iconContainer:"AboutMeSection_iconContainer__3rjvo",contactIcon:"AboutMeSection_contactIcon__3oGRW"}},function(e,t,n){e.exports={background:"BlogSection_background__yPqDo",sectionContainer:"BlogSection_sectionContainer__1JSII",sectionSeparator:"BlogSection_sectionSeparator__1zlA0",paragraph:"BlogSection_paragraph__qhF83",sectionHeader:"BlogSecti
on_sectionHeader__YuCFs",sectionFooter:"BlogSection_sectionFooter__15nG8",carouselContainer:"BlogSection_carouselContainer__3zp64",cardLink:"BlogSection_cardLink__HkZGa",cardContainer:"BlogSection_cardContainer__19C8y",cardImageGrad:"BlogSection_cardImageGrad__tlIgC",cardImage:"BlogSection_cardImage__3F3r5",cardTitle:"BlogSection_cardTitle__2bU-A",cardText:"BlogSection_cardText__1l2EV",cardPublishDate:"BlogSection_cardPublishDate__3DQ6l"}},function(e,t,n){e.exports={titleSection:"TitleSection_titleSection__1hAM5","darken-banner":"TitleSection_darken-banner__dbzlS",titleContainer:"TitleSection_titleContainer__32iEJ",titleHeading:"TitleSection_titleHeading__3ZS9H","fly-in":"TitleSection_fly-in__3smnK",titleSubheading:"TitleSection_titleSubheading__13Vhc",tsh1:"TitleSection_tsh1__10ZRe","typing-lg":"TitleSection_typing-lg__2n1d4","blink-caret":"TitleSection_blink-caret__8hWhO","typing-sm":"TitleSection_typing-sm__2jHyg",tsh2:"TitleSection_tsh2__1elTv",tsh3:"TitleSection_tsh3__2xhBY",moreButton:"TitleSection_moreButton__3ux1l","button-appear":"TitleSection_button-appear__1bpax"}},function(e,t,n){e.exports={tagContainer:"Diarysta_tagContainer__xy-an",tagLineBreak:"Diarysta_tagLineBreak__e-2Xl",tagButton:"Diarysta_tagButton__Rpb3s",tagActive:"Diarysta_tagActive__1qwr1",postContainer:"Diarysta_postContainer__2WL2T",titleRow:"Diarysta_titleRow__1x_s6",postTitle:"Diarysta_postTitle__2boWT",postSubtitle:"Diarysta_postSubtitle__6OvEl",publishDate:"Diarysta_publishDate__GP_iP",postTag:"Diarysta_postTag__2vG44",imageContainer:"Diarysta_imageContainer__3cMrn",postContentContainer:"Diarysta_postContentContainer__9E3_Q",postImage:"Diarysta_postImage__3zjFV",textPreviewContainer:"Diarysta_textPreviewContainer__26nnM",textPreviewContainerp:"Diarysta_textPreviewContainerp__gPaTJ",bottomRow:"Diarysta_bottomRow__2i7hF",bottomLine:"Diarysta_bottomLine__1yaUv",page:"Diarysta_page__2yaGv",postPageToc:"Diarysta_postPageToc__Gs6XY",tocHide:"Diarysta_tocHide__3IH_L",current:"Diarysta_current__opKsx",title:"Diarysta_title__2o5Qp",diarystaTitle:"Diarysta_diarystaTitle__13vYl",diarystaImg:"Diarysta_diarystaImg__29Z9h"}},function(e,t,n){e.exports={navLinkContainer:"TitleSectionNavLinks_navLinkContainer__1Z8PS",navLink:"TitleSectionNavLinks_navLink__gTfbN"}},function(e,t,n){e.exports={cardLink:"SkillsSectionCard_cardLink__1PX61",cardImageGrad:"SkillsSectionCard_cardImageGrad__1ERei",cardImage:"SkillsSectionCard_cardImage__25WbJ",cardText:"SkillsSectionCard_cardText__1XKiC",modalContainer:"SkillsSectionCard_modalContainer__1W6rd",skillIcon:"SkillsSectionCard_skillIcon__3bPBR",modalTitle:"SkillsSectionCard_modalTitle__2aarh"}},,function(e,t,n){e.exports={sectionContainer:"ContactSection_sectionContainer__14WRo",sectionSeparator:"ContactSection_sectionSeparator__1viZs",paragraph:"ContactSection_paragraph__1QSGz",sectionHeader:"ContactSection_sectionHeader__1KB-8",sectionFooter:"ContactSection_sectionFooter__3uOUr",contactIcon:"ContactSection_contactIcon__1zkJ4",contactIconLink:"ContactSection_contactIconLink__2e3bU"}},function(e,t,n){e.exports={background:"Footer_background__20CJN",sectionContainer:"Footer_sectionContainer__18u_H",sectionSeparator:"Footer_sectionSeparator__I0tWl",paragraph:"Footer_paragraph__TEUer",sectionHeader:"Footer_sectionHeader__Bjn5V",sectionFooter:"Footer_sectionFooter__3OggR",contactIcon:"Footer_contactIcon__2Mzf-"}},function(e,t,n){e.exports={titleSection:"TitleSection_titleSection__2u5j_","darken-banner":"TitleSection_darken-banner__PLl9g",titleContainer:"TitleSection_titleContainer__2K1NK",titleH
eading:"TitleSection_titleHeading__dgfxI","fly-in":"TitleSection_fly-in__1TU6_",titleSubheading:"TitleSection_titleSubheading__2g-6K",tsh1:"TitleSection_tsh1__1wyCV","typing-lg":"TitleSection_typing-lg__2iWnu","blink-caret":"TitleSection_blink-caret__Ltugk","typing-sm":"TitleSection_typing-sm__3cQVl",tsh2:"TitleSection_tsh2__JdQar",tsh3:"TitleSection_tsh3__1r79M",moreButton:"TitleSection_moreButton__2UZTA","button-appear":"TitleSection_button-appear__lYq4v",appear:"TitleSection_appear__xDOJD"}},function(e,t,n){e.exports={tagContainer:"VecBrain_tagContainer__1PecW",tagLineBreak:"VecBrain_tagLineBreak__1aknI",tagButton:"VecBrain_tagButton__1sMxE",tagActive:"VecBrain_tagActive__2i69T",postContainer:"VecBrain_postContainer__1er-j",titleRow:"VecBrain_titleRow__UfMDA",postTitle:"VecBrain_postTitle__238CC",postSubtitle:"VecBrain_postSubtitle__18BD_",publishDate:"VecBrain_publishDate__IQFNV",postTag:"VecBrain_postTag__1xwEk",imageContainer:"VecBrain_imageContainer__GIkGS",postContentContainer:"VecBrain_postContentContainer__3XsIy",postImage:"VecBrain_postImage__1Zh2q",textPreviewContainer:"VecBrain_textPreviewContainer__1uzhR",textPreviewContainerp:"VecBrain_textPreviewContainerp__1Ca6J",bottomRow:"VecBrain_bottomRow__2ySUi",bottomLine:"VecBrain_bottomLine__2hMxF",page:"VecBrain_page__3UtEl",postPageToc:"VecBrain_postPageToc__qPPZN",tocHide:"VecBrain_tocHide__1A5S6",current:"VecBrain_current__2FC2u",title:"VecBrain_title__13QgR",vecBrainTitle:"VecBrain_vecBrainTitle__A7iFS",vecBrainImg1:"VecBrain_vecBrainImg1__3ZRuw",vecBrainImg2:"VecBrain_vecBrainImg2__3fyWT"}},,,,function(e,t,n){e.exports={background:"ProjectSection_background__WSxLh",sectionContainer:"ProjectSection_sectionContainer__1_Djd",sectionSeparator:"ProjectSection_sectionSeparator__1IgQJ",paragraph:"ProjectSection_paragraph__3b7Lj",sectionHeader:"ProjectSection_sectionHeader__2Rc_l",sectionFooter:"ProjectSection_sectionFooter__1Vrl2"}},function(e,t,n){e.exports={cardLink:"ProjectSectionCard_cardLink__3Edl3",cardImageGrad:"ProjectSectionCard_cardImageGrad__2KdSA",cardImage:"ProjectSectionCard_cardImage__25QJk",cardText:"ProjectSectionCard_cardText__34N9E"}},,,,,,,,,,function(e,t,n){e.exports={page:"Home_page__2NQ-C"}},,function(e,t,n){e.exports={page:"Blog_page__1JTSz"}},,,,,,,,,,function(e,t,n){},function(e,t,n){},,function(e,t,n){var a={"./analysis.svg":57,"./chemistry.svg":58,"./cloud.svg":59,"./dumbbell.svg":60,"./guitar.svg":61,"./language.svg":62,"./machine-learning.svg":63,"./sky.svg":64,"./teaching.svg":65,"./webdev.svg":66};function o(e){var t=i(e);return n(t)}function i(e){if(!n.o(a,e)){var t=new Error("Cannot find module '"+e+"'");throw t.code="MODULE_NOT_FOUND",t}return a[e]}o.keys=function(){return Object.keys(a)},o.resolve=i,e.exports=o,o.id=56},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/analysis.36227fc2.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/chemistry.e9566024.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/cloud.4410a686.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/dumbbell.24eb3b33.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/guitar.fcebbb0b.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/language.42634d3e.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/machine-learning.deed77c1.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/sky.452a1848.svg"},function(e,t,n){"use 
strict";n.r(t),t.default=n.p+"static/media/teaching.48fbe7ef.svg"},function(e,t,n){"use strict";n.r(t),t.default=n.p+"static/media/webdev.ef749d5f.svg"},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Effectiveness of Decision Tree Ensembles","Why tree-based algorithms perform so well on real-world tabular data",new Date("2024-09-09"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/decision_tree_forest.png","Sometimes you just can't see the forest for the Decision Trees.",["Data Science & AI/ML","Learning"],"**TL;DR:** Decision Tree Ensembles like XGBoost and Random Forest consistently outperform other algorithms on real-world business data. I argue that this is largely due to their non-parametric decision functions based on discrete splits rather than distances in feature space, making them more robust to noise and outliers, while effectively capturing complex non-linear relationships and feature interactions.\n\nAll the code used to generate the data and plots in this post can be found in this [Github repo](https://github.com/Pascal-Bliem/effectiveness-of-decision-tree-ensembles).\n\nI will discuss the following aspects in detail here:\n\n1. [The Problem with real-world data](#the-problem-with-real-world-data)\n2. [The strengths of Decision Tree Ensembles](#the-strengths-of-decision-tree-ensembles)\n3. [The Decision Function](#the-decision-function)\n4. [A Note on Overfitting](#a-note-on-overfitting)\n5. [Comparison of Decision boundaries with parametric models](#comparison-of-decision-boundaries-with-parametric-models)\n6. [Influence of outliers](#influence-of-outliers)\n7. [Influence of categoric variables](#influence-of-categoric-variables)\n8. [Alternatives to Trees](#alternatives-to-trees)\n9. [Conclusion](#conclusion)\n\n### The Problem with real-world data\n\nWhen I worked at an insurance company, developing machine learning models to predict customer churn and make product recommendations, I noticed an interesting trend: **decision tree ensembles** like [Random Forest](https://en.wikipedia.org/wiki/Random_forest) and [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) (e.g. [XGBoost](https://xgboost.readthedocs.io/), [CatBoost](https://catboost.ai/), [LightGBM](https://lightgbm.readthedocs.io/)) consistently outperformed other algorithms on real-world data. This wasn\u2019t just a small edge in performance \u2014 these models seemed to thrive where others struggled, handling the complexities of business data much more effectively than alternatives like logistic regression, support vector machines (even with non-linear kernels), and simple neural networks (not taking into account modern nets more suitable for tabular data, like [NODE](https://arxiv.org/abs/1909.06312) or [TabNet](https://arxiv.org/abs/1908.07442)).\n\nThe key difference I observed between the datasets we used in the insurance industry and those often seen in academic exercises or machine learning competitions was the nature of real-world data, which was...well...kinda messy. Unlike carefully prepared, well-balanced datasets, real-world data had certain problematic characteristics that made many traditional algorithms perform quite poorly. 
These issues included:\n\n- **Mostly tabular data**, inhomogeneously collected from various input sources like application databases and data warehouses.\n- **Many categorical features**, often non-ordinal but still highly relevant.\n- **Significant amounts of missing data**, requiring imputation strategies.\n- **High levels of noise** in the data due to errors in data entry or obscure business logic embedded in ETL pipelines.\n- **Skewed distributions** with relevant outliers that shouldn\u2019t simply be discarded.\n- **Feature correlation and collinearity**, sometimes strong, but not so strong that you'd just want to drop some of the features.\n- **Class imbalances**, both in target variables and feature categories.\n\nAnd I also had somewhat of an intuition that there were very relevant but small **pockets of relevant feature combinations** in the high-dimensional feature space, which contained just the information we needed for some edge-case predictions.\n\n![Real-world data can be a mess sometimes.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/messy_data.png)\n\n### The strengths of Decision Tree Ensembles\n\nWhere traditional algorithms struggled with these challenges, **decision tree ensembles** handled them much more effectively. I believe this is largely due to the **non-smooth, non-parametric nature** of decision tree decision functions, which are based on discrete, hierarchical **splits** rather than smooth, parametrized functions sensitive to distances in feature space. They just don't make an assumption regarding the distribution of the data. This provides several advantages:\n\n- **Robustness against noise**: Decision trees tend to ignore irrelevant noise by splitting only on the most informative features.\n- **Handling of skewed distributions and outliers**: Trees can isolate outliers into their own branches without being negatively affected.\n- **Capturing complex, non-linear relationships**: The hierarchical structure of trees allows them to model intricate interactions between features and the target variable.\n- **Feature interaction and collinearity**: Decision trees can naturally capture correlations between features and are resilient to collinearity, as they split features independently and don't depend on a combination of coefficients across all features.\n- **Native handling of categorical features**: Trees can create splits based on the category values, avoiding the need for complex and/or dimensionality-increasing preprocessing like one-hot encoding.\n- **Handling of large and imbalanced datasets**: Ensemble methods scale well and can address class imbalances, especially with Gradient Boosting, where each estimator corrects errors from previous iterations, helping them pay more attention to harder-to-classify instances, often from minority classes.\n- **Lower requirement for feature engineering**: The model makes fewer assumptions about the underlying data, reducing the need for manual feature engineering, which can be useful if the business logic generating the data is unclear and you don't want to let your potentially incorrect assumptions influence the model negatively.\n- **Reduced overfitting through ensembling**: While individual decision trees are prone to overfitting, ensemble methods like Random Forests and Gradient Boosting combine many weak learners to improve generalization.\n\nIn short, many of the advantages of decision tree ensembles directly addressed the issues present in the real-world data I 
worked with.\n\nExpressed in words, this is all you need to know. If you're convinced already, you can stop reading here. If you are, however, curious about understanding why what I mentioned above is the case, please keep reading. In the following, I will talk a bit more about how the decision functions of decision trees work, provide some examples for some of the points I mentioned above, and demonstrate their strengths.\n\n### The Decision Function\n\nLet's have a look at how tree models make their decisions. We'll take the CART algorithm (Classification and Regression Trees), as implemented in [scikit-learn](https://scikit-learn.org/stable/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart), as an example (though there are other algorithms as well, e.g. [ID3](https://en.wikipedia.org/wiki/ID3_algorithm), [C4.5](https://en.wikipedia.org/wiki/C4.5_algorithm), [CHAID](https://en.wikipedia.org/wiki/Chi-square_automatic_interaction_detection)). We'll focus primarily on classification here.\n\nDecisions are made through a series of binary splits based on feature values. These splits are determined by finding the threshold that best separates the data at each node. The goal is to create homogeneous subsets with respect to the target variable. The tree is constructed by recursively splitting the data, and the decision function at each node is non-parametric, meaning it doesn\u2019t rely on a fixed equation or coefficients as in linear models. The CART algorithm is used for both classification and regression tasks. The algorithm operates by selecting splits that minimize the impurity (or maximize the information gain) at each node, which leads to the following decision function:\n\n$f(x) = \\sum_{i=1}^{M} I(x \\in R_{i}) * c_{i}$\n\nWhere $M$ is the number of terminal nodes (or leaves) in the tree, $I(x \\in R_{i})$ is an indicator function that checks if the input feature $x$ falls within the region $R_{i}$, which corresponds to the terminal node $i$, and $c_{i}$ is the predicted outcome (e.g., class label or regression output) for all inputs that reach node $i$. You can see right away that there are no parameters/coefficients that need to interact with the values of the features; it only matters whether the combination of features falls into the region of a particular terminal node or not.\n\nSo how does the algorithm decide what series of splits defines a terminal node's region in feature space? The CART algorithm makes splits in a way that tries to minimize a loss function during training, e.g. Gini impurity for classification or the mean squared error (MSE) for regression. Let's focus on classification here. The Gini impurity at a node $i$ is\n\n$Gini(i) = 1 - \\sum_{c=1}^{C} p_{c}^{2}$\n\nWhere $C$ is the total number of classes and $p_{c}$ is the proportion of samples of class $c$ at node $i$. You can see that the more the samples at a node belong to a single class, the \"cleaner\" the split and the lower the impurity. At each split, the algorithm selects the feature and the threshold within that feature that yield the lowest impurity (or highest information gain). It selects the split that best separates the classes from each other. And it keeps hierarchically adding more of these best splits, building a tree of splits until a stopping criterion (e.g. maximal depth of the tree) is met.\n\n
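As a minimal sketch of what fitting such a tree looks like in practice (this is not the exact code from the repo linked above, just an illustration using scikit-learn's `DecisionTreeClassifier` and `export_text`, with an assumed `max_depth` of 3), we can fit a small CART tree on the Iris data that ships with scikit-learn and print its hierarchy of splits:\n\n```python\nfrom sklearn.datasets import load_iris\nfrom sklearn.tree import DecisionTreeClassifier, export_text\n\n# load the Iris flower data set bundled with scikit-learn\niris = load_iris()\nX, y = iris.data, iris.target\n\n# fit a single CART classification tree, limited to a depth of 3\ntree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)\ntree.fit(X, y)\n\n# print the (feature, threshold) split at each node and the resulting leaves\nprint(export_text(tree, feature_names=iris.feature_names))\n```\n\n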
In the figure below, you can see how such a decision tree is created on the example of classifying the [Iris flower data set](https://www.kaggle.com/datasets/uciml/iris). You can see how each consecutive node makes a split on a given feature and threshold to separate the remaining data in the best possible way.\n\n![Decision tree as a graph, showing how the data is split on a feature and threshold at each node.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/decision_tree_graph.png)\n\nYou can see how this behavior results in all the advantages we've discussed above. The resulting decision boundary is not constrained by a particular functional form, since with every new split, the decision boundary can \"take a new direction\" in the feature space. At which value the split threshold is set is determined only by which value results in the best class separation, not by how the feature values are distributed above or below the threshold. Basically, it can partition the feature space into arbitrarily shaped regions, which lets it deal with complicated non-linear relationships or non-ordinal categorical features. It deals with skewed distributions and outliers by isolating them into their own branches without affecting the rest of the model. It deals with noise or collinearity between features by simply ignoring them if those features don\u2019t provide useful information for reducing impurity.\n\nIn the figure below, you can see a simple 3D example of how the feature space is divided into sharply cut regions that separate the different classes. One can see that the decision boundaries (meaning the boundaries beyond which the algorithm would predict one class or another) are not bound by a specific functional form; they can be cut however the algorithm finds most effective.\n\n![The Decision Tree cuts the feature space into regions that separate the different classes. The boundaries between those regions are called decision boundaries.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/tree_decision_boundaries.png)\n\n### A Note on Overfitting\n\nThe discussion so far was only about a single decision tree. We could just keep growing the tree based on our training set until it perfectly covers every little edge case in our training data, and classifies it perfectly. As you can imagine, it would probably generalize very poorly on new data that it hasn't seen during the training process - a phenomenon called overfitting. Combining many weak decision trees (of very limited size) into a large ensemble remedies this problem because each individual tree can't become complex enough to overfit as much, but combined together, they can still make strong predictions.\n\nIn the figure below, you can see how that works. We can see the learning curves (training & validation accuracy vs. number of training examples) of a single decision tree (on the left) vs. a Random Forest consisting of 300 Decision Trees (on the right). You can see that while both models achieve perfect accuracy on the training data, the single decision tree has a much worse validation accuracy, even when more training data is added. That is an indication that it overfits on the training data and does not generalize well to the validation data. For the Random Forest, this problem is much less pronounced.\n\n![Learning curves that show how a single Decision Tree overfits training data much more than an Ensemble.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/learning_curve.png)\n\n
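Learning curves like the ones above could be reproduced roughly as follows. This is only a sketch: the post does not specify the exact data set used for these plots, so a synthetic classification problem from `make_classification` stands in for it, and 5-fold cross-validation is an assumed setup.\n\n```python\nimport numpy as np\nfrom sklearn.datasets import make_classification\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import learning_curve\nfrom sklearn.tree import DecisionTreeClassifier\n\n# synthetic stand-in data (the original plots were made on different data)\nX, y = make_classification(n_samples=5000, n_features=20, n_informative=10, random_state=42)\n\nmodels = {\n    'single decision tree': DecisionTreeClassifier(random_state=42),\n    'random forest (300 trees)': RandomForestClassifier(n_estimators=300, random_state=42),\n}\n\nfor name, model in models.items():\n    sizes, train_scores, val_scores = learning_curve(\n        model, X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv=5, scoring='accuracy', n_jobs=-1\n    )\n    # mean training vs. validation accuracy at each training set size\n    print(name, train_scores.mean(axis=1).round(3), val_scores.mean(axis=1).round(3))\n```\n\n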
In the following, let's deep-dive into a few of the advantages we've discussed so far and demonstrate them with code examples and visualizations.\n\n### Comparison of Decision boundaries with parametric models\n\nNow, let's contrast this with other models like logistic regression, support vector machines (SVMs), and neural networks, which rely on parametric decision functions. These models use a fixed mathematical form to map input features to output predictions. The \"parameters\" here are e.g. the coefficients in a logistic regression, the shape parameter in an SVM radial basis function kernel, or the weights in a neural network. To be fair, neural networks or SVM kernels like radial basis functions or polynomials can in theory approximate any continuous function for a given input range, but in practice, when training them with a finite number of hyperparameters to tune, the particular mathematical form used by a model will still strongly determine what its decision boundaries are going to look like. This makes the encoding of complicated non-linear relationships much harder than with decision trees, and correlated features can distort the parameter estimates.\n\nI'll try to demonstrate that in the figure below, simulating a binary classification problem (meaning there are only 2 classes, red and blue in the figure) with 2 features which have a complex non-linear relationship with the target variable (the class). The data distribution doesn't follow a regular mathematical form, because I just drew it by hand. It should symbolize the notion of class \"pockets\" or \"corridors\" in feature space. Since the problem with only 2 classes and 2 features is a very simple one, we'll try to solve it with 4 very simple classifiers: a Random Forest with 30 trees, a Logistic Regression, a Support Vector Machine (SVM) with a 5th degree polynomial kernel, and a Multi-Layer Perceptron (MLP) neural network with 2 hidden layers that each have 10 neurons.\n\n![Decision boundaries for different classifiers on complex, non-linear data.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/non_linear_decision_boundaries.png)\n\nAs you can see, the Logistic Regression fails miserably, as it's a linear model and the relationship between the data and the class isn't linear at all. The SVM and MLP are a bit better at fitting to the data but also have trouble bending their functionally defined decision boundaries to the data. The Random Forest, however, can easily cut out the pockets and corridors that correctly classify the data, as it can change the direction of its decision boundaries however it wants with every new split. To be fair, if I had chosen more complex models, e.g. an SVM with a scaled radial basis function kernel or an MLP with more layers or more neurons per layer, they would also have done a good job at classifying the data. But since this is a very simple problem, I also want to keep the models simple to make my point. The Random Forest with only 30 trees is also relatively simple (in real-world problems, often hundreds of trees are used), but it already does a very good job here.\n\n
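A rough sketch of this comparison (not the exact code from the repo; the hand-drawn data is replaced here by scikit-learn's `make_moons` as an assumed stand-in, and cross-validated accuracy is used instead of plotting the boundaries) could look like this:\n\n```python\nfrom sklearn.datasets import make_moons\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.svm import SVC\n\n# noisy two-moons data as a stand-in for the hand-drawn 'pockets and corridors'\nX, y = make_moons(n_samples=1000, noise=0.3, random_state=42)\n\nmodels = {\n    'random forest (30 trees)': RandomForestClassifier(n_estimators=30, random_state=42),\n    'logistic regression': LogisticRegression(),\n    'svm (5th degree polynomial kernel)': SVC(kernel='poly', degree=5),\n    'mlp (2 hidden layers, 10 neurons each)': MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=2000, random_state=42),\n}\n\nfor name, model in models.items():\n    # 5-fold cross-validated accuracy as a rough proxy for how well each decision boundary fits\n    print(name, cross_val_score(model, X, y, cv=5).mean().round(3))\n```\n\n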
### Influence of outliers\n\nParametric models are also heavily influenced by outliers because the entire dataset is used to optimize the parameters (rather than splitting on one feature threshold at a time as in decision trees). Outliers can skew the decision boundary, leading to poor generalization. For example, in SVMs, outliers near the decision boundary can drastically affect the placement of the support vectors, leading to overfitting. In a linear model, an outlier would pull the decision boundary away from where it should really be.\n\nI tried to simulate such a situation in the figure below. This time I created some non-linear data that still has a more or less clearly defined boundary. But then I also added some strong outliers for each class that lie on the wrong side of that boundary. We're again using the same classifiers as before: Random Forest, Logistic Regression, SVM, and MLP. You can see how Logistic Regression and the SVM fail to adjust their decision boundaries to the form of the data that is now distorted by the outliers. The MLP actually does a pretty good job at \"cutting off\" the outlier sections from the bulk of the data, but the Random Forest does an even better job at it, as its decision boundaries can change directions more flexibly.\n\n![Decision boundaries for different classifiers on data with strong outliers on the \"wrong\" side of the boundary.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/outlier_decision_boundaries.png)\n\n### Influence of categoric variables\n\nFurthermore, parametric models typically require that categorical features be encoded into numeric representations (e.g. label or one-hot encoding). For label encoding (just representing the classes/category values as consecutive numbers), the model would assume that the distances between the categories' values are meaningful. This can be problematic even for ordinal categories where the distance between values isn't really clear, and complete nonsense for non-ordinal categories where there's no meaningful distance between values. Non-ordinal categories can be one-hot encoded (creating a binary column for each category value), but this can introduce high dimensionality and sparsity, especially with features that have many category values, making these models less efficient on such data (see [The Curse of Dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality)).\n\nTo avoid this dimensionality explosion, non-ordinal categories sometimes still have to be label-encoded. This is what I simulate in the figure below. We still have a binary classification problem with 2 features. One of the features is continuous and correlated with the class. But now the other one is a label-encoded, non-ordinal category. I synthesized that data by creating 10 categories (1 to 10), and then randomly assigning half of the categories with a higher probability to class red and the other half with a higher probability to class blue. This means that the numerical distance between these category values is not meaningful for predicting the class. However, it is very predictive whether a data point is of a certain category or not.\n\n
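One way to synthesize data with these properties (a sketch, not the exact generation code from the repo; the 80/20 mixing probability is an assumption) is shown below:\n\n```python\nimport numpy as np\n\nrng = np.random.default_rng(42)\nn = 1000\n\n# binary target: class 0 (red) vs. class 1 (blue)\ny = rng.integers(0, 2, size=n)\n\n# continuous feature correlated with the class\nx_cont = y + rng.normal(scale=1.0, size=n)\n\n# 10 non-ordinal categories (1 to 10): half lean towards class 0, half towards class 1\ncats_0, cats_1 = np.arange(1, 6), np.arange(6, 11)\ntypical = np.where(y == 0, rng.choice(cats_0, size=n), rng.choice(cats_1, size=n))\nrandom_cat = rng.integers(1, 11, size=n)\n# 80% of the time pick from the class-typical half, otherwise pick any category\nx_cat = np.where(rng.random(n) < 0.8, typical, random_cat)\n\n# the label-encoded category is used directly as a numeric feature\nX = np.column_stack([x_cont, x_cat])\n```\n\n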
You can see in the figure how the Logistic Regression, SVM, and MLP don't do a great job at representing this binary question of \"is this data point of a certain category or not\" and are rather confused by trying to assign meaning to the distance between the category labels, and fit their decision boundaries accordingly. The Random Forest, on the other hand, can just cut decision boundary corridors through the feature space that correspond very well to just picking a particular category value; hence, it performs much better at this classification with label-encoded non-ordinal categories.\n\n![Decision boundaries for different classifiers on data with a non-ordinal categorical feature.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/effectiveness-of-decision-tree-ensembles/categorical_decision_boundaries.png)\n\n### Alternatives to Trees\n\nThere are none! All hail the mighty tree! JK, jokes aside, there are a couple of promising alternatives in the realm of neural networks. As we've seen in the examples above, the MLP was usually close behind the Random Forest in terms of performance, and could have easily performed even better if we had made the model a bit more complex. Neural networks, with their large number of parameters, can be very flexible and great at capturing complex relationships in data, though they're still less resilient to extreme noise and strongly skewed distributions. However, many of the neural networks used on tabular data also imitate the behavior of decision trees or even use decision trees directly. A few noteworthy mentions are:\n\n- **[TabNet](https://arxiv.org/abs/1908.07442):** uses a combination of attention mechanisms and decision steps to learn from tabular features in a way that mimics decision trees.\n- **[Neural Oblivious Decision Ensembles (NODE)](https://arxiv.org/abs/1909.06312):** each layer in NODE is composed of differentiable versions of decision trees, allowing it to learn hierarchical feature representations similar to how a decision tree would.\n- **[DeepGBM](https://www.microsoft.com/en-us/research/uploads/prod/2019/08/deepgbm_kdd2019__CR_.pdf):** uses Gradient Boosting Machines (like LightGBM or XGBoost) to learn the initial data representations, and then feeds those into a neural network for further fine-tuning.\n\nThe Transformer architecture, which has been driving the development of large language models in recent years, has also been tried on tabular data. The **[Feature Tokenization Transformer (FT-Transformer)](https://arxiv.org/pdf/2106.11959)** uses self-attention mechanisms to learn from tabular data by encoding each feature as a token and applying attention to capture feature dependencies.\n\nAs is usual for large neural networks with complex architectures, these networks often need extensive tuning of their hyperparameters and of the training process itself to perform well, and they also require much more computational resources for doing so. And they still don't universally outperform Decision Tree Ensembles, which are typically easier to tune and train well and (of course, depending on the size) require less computational resources. 
Therefore, it is no surprise that Decision Tree Ensembles like XGBoost or Random Forests are still the quick-and-easy solution for a lot of messy, real-world tabular data.\n\n### Conclusion\n\nWe've seen that in the context of real-world tabular data, decision tree ensembles like XGBoost and Random Forest consistently outperform other algorithms, especially when dealing with messy, complex datasets that contain noise, missing values, outliers, and a mix of categorical and continuous variables. Their non-parametric decision functions, based on discrete hierarchical splits rather than smooth mathematical functions, allow them to capture intricate non-linear relationships and feature interactions, while remaining robust against many of the common issues in real-world business data, such as class imbalance, highly skewed data distribution, feature collinearity, and high-cardinality categorical variables .\n\nThese ensembles are also highly effective in addressing the real-world challenges where traditional models like logistic regression, support vector machines, or even basic neural networks may struggle. However, as neural network models like TabNet and NODE continue to evolve, we now have some deep learning alternatives that can also compete in structured data environments. But for most applications involving complex, noisy, or imbalanced tabular data, decision tree ensembles remain a strong, reliable, interpretable, and relatively easy-to-use choice.\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("A Journey of Language Learning: From German to Chinese","How 5 Languages Shaped my Experiences, Perspectives, and my Understanding of Culture and Myself",new Date("2024-08-30"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/language-learning/tuna_cyber.webp","",["Learning","Non-Tech"],"**TL;DR:** In this post I look back on the journey of learning 5 different languages, from my native German, over English in school, to Indonesian, Spanish, and Chinese as an adult. I compare the structural differences from the perspective of a learner, my evolution of learning methods and resources, cultural insights and immersion, and how each language added something new to my personality.\n\nI'll talk about the following topics:\n\n1. [Why even create such a linguistic chaos in my head](#why-even-create-such-a-linguistic-chaos-in-my-head)\n2. [Structural Differences: From German to Chinese, from Europe to Asia](#structural-differences-from-german-to-chinese-from-europe-to-asia)\n3. [Learning Methods and Resources](#learning-methods-and-resources)\n4. [Cultural Insight Through Language and Multiple Personalities](#cultural-insight-through-language-and-multiple-personalities)\n5. [Travel and Immersion: How Speaking the Language Makes a Difference](#travel-and-immersion-how-speaking-the-language-makes-a-difference)\n6. [Conclusion](#conclusion)\n\n### Why even create such a linguistic chaos in my head\n\nLet me begin with a brief personal note\u2014if you're not particularly interested in my backstory, feel free to [skip ahead to the next section](#structural-differences-from-german-to-chinese-from-europe-to-asia). The rest of the post will still make sense without diving into these details. For those who are curious about how I ended up filling my head with an array of languages, here\u2019s the story.\n\nI grew up speaking German, of course, but like most native speakers, I never thought much about the complexity of the language. 
I learned it naturally as a child, and it was simply the way I communicated. It wasn\u2019t until much later that I realized just how intricate German grammar really is.\n\nDuring my childhood, it felt sufficient to express my basic needs without giving much thought to other languages.\nSpanish had an early presence in my life, as well. My grandmother lived in Spain, and my family often traveled there for vacations. Despite this, I never thought of Spanish as anything more than German with different sounds. I was curious about the language, but no one encouraged me to pursue it, so I never did.\n\nIt wasn\u2019t until high school, when I started learning English, that I began to see languages differently. For the first time, I understood that languages have their own structures, their own unique ways of expressing things. You can say something in German that doesn\u2019t quite translate into English, and vice versa. I also noticed that the structure of a language seemed to reflect the character of its speakers. Both German and Germans being a bit too dry and exact for their own good, for example.\n\nHowever, like for many people, my experience with language learning in school wasn\u2019t particularly inspiring. Language classes often felt more focused on passing exams than on actual communication, so I didn\u2019t enjoy it much at the time. That said, I had plenty of exposure to English outside the classroom. I took other classes in English, and when I started playing guitar as a teenager, I was eager to understand the lyrics of the songs I played and watched tons of guitar tutorial videos in English. Most of the internet content I consumed was also in English\u2014there simply wasn\u2019t as much available in German back then (yes, feeling old writing this). Without much deliberate effort, I found myself absorbing English through these activities, almost passively assimilating it.\n\nFrench, on the other hand, was a very different experience. I studied it for four years in high school, but with little natural exposure to the language and a series of poor teachers, it became a source of frustration. My classes were filled with rote memorization of vocabulary and verb conjugations, and I ended up disliking the language entirely. At the time, I convinced myself that I was simply not good at learning languages\u2014reinforced by the stereotype that one is either good at STEM subjects (which I was) or languages, but not both.\n\nFor a long time after high school, I didn\u2019t seriously engage with language learning. I still had a lingering desire to learn Spanish, and after visiting Taiwan, I also became interested in Chinese. However, my early attempts didn\u2019t go very far. I took a few university classes but made little progress, and my motivation was low. I assumed I just wasn\u2019t suited for learning languages.\n\nEverything changed when I moved to Indonesia after finishing my master\u2019s degree. I was there to volunteer and take online programming courses as I transitioned from materials science to software engineering. At first, I struggled to communicate because many of the people I met didn\u2019t speak much English. If I wanted to make the most of my time there and have meaningful conversations, I needed to learn Indonesian.\n\nTo my surprise, Indonesian turned out to be much easier than I expected. The language is relatively simple, and the people I met were friendly and eager to chat. 
Within three months, I could hold basic conversations, and after six months, I felt comfortable discussing most day-to-day topics. More importantly, this was the first time I experienced language learning as a tool for real communication, not just as an academic exercise. It was a turning point for me. Before, I had never been particularly interested in languages for their own sake. But learning Indonesian allowed me to connect with people, make friends, and gain insight into a culture that I wouldn\u2019t have accessed otherwise. **This realization made all the difference**.\n\nAfter that experience, I decided to pick up Spanish again. This time, I approached it differently\u2014self-study, getting a private tutor (who became a wonderful friend as well), and spending time in Spanish-speaking countries. I traveled to Spain and several Latin American countries, practicing the language as much as I could. It felt rewarding in a way that classroom learning never had, and I began to appreciate how much more immersive travel can be when you can speak the local language.\n\nWith Indonesian and Spanish under my belt, I thought I was ready for the next challenge: Chinese. I moved to Taiwan and began learning, but the experience was much more difficult than I anticipated. Chinese, with its characters and tonal system, presented obstacles I hadn\u2019t faced with the other languages. It took a long time before I started making any real progress, and the early stages were filled with frustration. However, after six months, I can understand quite a bit and hold simple conversations. It\u2019s still a work in progress, but I\u2019m beginning to experience the same sense of achievement that motivated me in Indonesia.\n\nSo why do I continue to add to the linguistic chaos in my head? It\u2019s not because I love languages themselves\u2014I\u2019m not particularly interested in studying grammar for the sake of it. But languages are tools of communication. They allow me to connect with people, understand new cultures, and see the world from different perspectives. And while the process can be difficult, it just feels so rewarding to work really hard for it, invest a lot of effort into it, and then see it paying off with things as simple as a little chat with an old lady at a bus stop.\nThat\u2019s my story. Now, let\u2019s dive into the more technical aspects of the languages I\u2019ve learned and the methods I used.\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/language-learning/tuna_taipei.webp)\n\n### Structural Differences: From German to Chinese, from Europe to Asia\n\nIn this section, I want to explore how the languages I\u2019ve learned\u2014ranging from European languages like German, English, and Spanish to non-European ones like Indonesian and Chinese\u2014differ in terms of grammar, vocabulary, pronunciation, and writing systems. What emerges are some clear contrasts: grammatical complexity versus simplicity, flexible versus fixed syntax, and how context plays an enormous role in certain languages. We'll also see how intuition can help when learning related languages but leads to confusion when tackling those from entirely different language families.\n\nFirst, let\u2019s begin with the European languages. They serve as a useful reference point for me personally, since this post is written in one, and they\u2019re the linguistic environment I grew up in. 
European languages, like German, English, and Spanish, are notorious for their grammatical complexity (okay, English isn\u2019t that bad). They come with a wide range of tenses, grammatical genders, plurals, cases, and modes, all of which must be carefully navigated using conjugations, declensions, and a myriad of pronouns. In German, for example, you have to keep track of four grammatical cases (nominative, accusative, dative, and genitive), each influencing not only articles but also adjective endings and pronoun forms. Spanish and German also use grammatical gender (masculine, feminine, and in German, neuter), meaning nouns and their corresponding articles and adjectives must agree in gender, which adds another layer of complexity.\n\nFor those of us who grew up speaking these languages, we rarely think about these rules explicitly. However, when you start learning a language like Indonesian or Chinese (when I speak of Chinese here, I mean Mandarin Chinese), it becomes painfully clear how overcomplicated European languages can be. In Indonesian, for instance, there are no tenses or conjugations\u2014verbs don\u2019t change forms, and grammatical gender doesn\u2019t exist. Similarly, in Mandarin, verb forms remain constant regardless of tense, and particles like \u4e86 or \u904e simply indicate time. These differences highlight how European languages rely heavily on morphology, while other languages achieve clarity through simpler systems like context or word order.\n\nInterestingly, this complexity isn't just confined to Germanic or Romance languages; it\u2019s also found in other Indo-European languages, including those spoken in South Asia. Although I don\u2019t speak any Indian languages, I\u2019ve read some Sanskrit and Pali while studying Buddhist texts, and I've had conversations with Hindi and Nepali speakers about their languages. These languages, too, share many grammatical features with European languages, reinforcing the idea that such complexity runs deep within the Indo-European family.\n\nBut why do European languages need this grammatical complexity? One reason is that it allows for a great deal of flexibility in syntax. For example, in German, the different cases (nominative, accusative, dative, genitive) make it possible to reorder the elements of a sentence in ways that would be impossible in English, which relies on a fixed word order. This flexibility means that fewer things need to be referred to explicitly through context. The grammar handles a lot of the work. But this flexibility comes at a cost. The more flexible the syntax, the more intricate the grammar tends to be. And hence, the more frustration for learners (see Mark Twain\u2019s [The Awful German Language](https://en.wikipedia.org/wiki/The_Awful_German_Language)).\n\nWhen I first started learning English, I had no idea how much easier it was for me compared to someone from a non-Germanic language background. English and German share common roots, which means that many words and grammatical structures are familiar. Vocabulary like \"water\" and _Wasser_, or \"house\" and _Haus_, make the transition smoother. Even Spanish, despite being a Romance language, didn\u2019t feel entirely foreign. Besides being exposed to its sound a lot as a child, the concept of tenses was similar, and there\u2019s also a large amount of shared Latin vocabulary across European languages, thanks to the long-standing influence of Latin in science, religion, and politics. 
The Norman conquest of England, which introduced a lot of French (and thus Latin) into English, further bridges the gap.\n\nAnd then there\u2019s the issue of pronunciation\u2014English, with its irregular spellings and complex vowel system, often feels like a phonetic puzzle. Spanish, on the other hand, is a breeze in comparison, with a phonetically consistent and straightforward pronunciation system, and German, while having a bit more complex phonetic rules, is at least fairly consistent as well.\n\nNow, let\u2019s shift to Indonesian\u2014a stark contrast to the grammatical maze of European languages. Indonesian is wonderfully simple in its grammar. There are no tenses, no cases, and no gendered nouns. You indicate past or future simply by adding time-related words like _sudah_ (already) for the past or _akan_ (will) for the future. The one notable grammatical feature is _imbuhan_, a system of affixation that modifies the meaning of root words. Even so, this system is intuitive and easy to grasp, and even allows you to sometimes infer words you haven\u2019t learned yet.\n\nThis grammatical simplicity made learning Indonesian a pleasant experience, especially when compared to the headache-inducing German grammar that I now find is unnecessarily complicated. I\u2019ve seen my Indonesian friends struggle to learn German, bewildered by all the rules and exceptions, and frankly, I understand their frustration. Even Spanish was already more than enough grammar for me to learn, and it pales in comparison to German grammar.\n\nHowever, Indonesian has its own quirks. The language follows a more rigid subject-verb-object (SVO) word order. Some sentences require fixed structures to remain grammatically correct, particularly when using words like _yang_ (which/that). There\u2019s also a vast array of synonyms due to the influence of multiple languages\u2014Sanskrit, Arabic, Portuguese, Dutch, and, more recently, English. This means that word choice often depends heavily on the context, varying widely between casual conversation, formal speech, and written literature. Another advantage of Indonesian is that it uses the Latin script, which is very convenient for someone like me, coming from a European language background. The pronunciation is also quite straightforward, with few surprises or tongue-breaking sounds for a German speaker.\n\nWhen I turned to Mandarin Chinese, I found a language that operates on entirely different principles. Mandarin has very little in the way of the grammatical complexity found in European languages\u2014no tenses, no plurals, no cases, and no conjugations. Instead, it relies heavily on a strict word order to convey meaning. This is paired with a set of words and particles (like \u4e86, \u7684, \u9084, \u662f, \u4e5f, \u90fd, \u6703, \u5c31, \u624d, \u53ef, \u8d77, \u51fa, to name just a few) that modify the meaning of words in ways that are initially very confusing for someone who\u2019s linguistic intuition is based on the structures of European languages. The same particles and the words they act on can have vastly different meanings depending on the context, adding a layer of complexity that takes a lot of time to get used to (still working on it\u2026).\n\nIn Chinese, context is crucial, more so than in any other language I\u2019ve encountered. Many Chinese words sound the same but have different meanings (homophones), and often the only way to differentiate them is through the surrounding words. 
This is especially true in spoken Chinese, where you can rarely say just one word and be understood. You almost always need to combine words into phrases or sentences to clarify meaning. The tonal system also plays a major role in distinguishing words that would otherwise be homophones. Unlike European languages, where pitch is used for emphasis or emotional tone, in Chinese, pitch is the meaning.\n\nFor me, the tonal system is by far the most challenging aspect of learning Chinese. My ear just doesn\u2019t naturally pick up the changes in pitch. Even when I improvise on guitar, I focus more on finger movements than on pitch, and it turns out that this lack of pitch awareness carries over into my Chinese pronunciation. My tones are still shaky, and it\u2019s hard for me to detect them when others speak. This is compounded by the fact that intonation and stress, which we use freely in European languages, serve an entirely different function in Mandarin. So in addition to getting used to how variation in pitch is used as an important part of every word in Mandarin, one also has to _stop_ using it for certain types of emphasis, the way it's used in European languages, or one might accidentally call someone's mother a horse.\n\nFinally, there\u2019s the Chinese writing system, which is completely different from the Latin alphabet. Mandarin uses characters, each representing a word or concept, rather than sounds. Unlike alphabetic scripts, these characters don\u2019t directly convey how they\u2019re pronounced. While many characters include components that hint at pronunciation or meaning, they don\u2019t give you a complete picture, making it necessary to memorize thousands of characters and their pronunciation. Surprisingly, this hasn\u2019t been the hardest part of learning Chinese for me\u2014maybe because I\u2019m quite good at memorizing things. Instead, my greatest difficulties lie in mastering the tones and listening comprehension. At the point of writing this, I\u2019m in Taiwan for half a year already, and I still often have to ask people to repeat something 3 times before I really understand it.\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/language-learning/tuna_peru.webp)\n\n### Learning Methods and Resources\n\n#### Evolution of Learning Methods\n\nThe most important lesson I've learned throughout my language journey is that exposure is key. When you're young, the brain\u2019s plasticity allows you to passively assimilate a language through immersion alone. That\u2019s how I learned English\u2014through classes, media, and casual conversation, all of which seemed to just \u201cstick\u201d without much conscious effort. However, as I got older and tried to learn Spanish and Chinese in university, it became obvious that this passive method wasn't enough. Without a system, I struggled. I didn't have the time or motivation back then to figure out how to learn systematically, but it became clear that a structured approach would be necessary.\n\nMy experience learning Indonesian while living in Indonesia marked a turning point in my approach to languages. Indonesian\u2019s grammar is relatively simple, so I could quickly learn the basics (like imbuhan) from friends and practice in conversation. However, learning new vocabulary from scratch, without any overlap with German or English, posed a real challenge. I soon realized that memorizing large amounts of vocabulary required a different strategy, which I'll discuss later. 
With Spanish, I needed a more formal approach to master conjugations and grammar rules. When it came to Chinese, I overcomplicated things at first by creating very specific drills for tones, vocabulary, and grammar. Still, this method worked for me. As an adult, I found I needed this structure to systematically learn a piece of the language and then go out and apply it in conversation. The more languages I learned, and the more complex they became, the more structured my approach had to be.\n\nOne book that influenced my Chinese learning was [Fluent Forever](https://www.amazon.com/dp/B00IBZ405W/?tag=toweofbabe-20) by Gabriel Wyner, which introduced techniques like spaced repetition and focusing on pronunciation early. While I didn\u2019t agree with everything, such as his \"never translate\" philosophy, I found many techniques helpful, especially as I became more systematic in my approach.\n\n#### Resources and Budget\n\nAnother trend I've noticed is that you can absolutely learn a language for free\u2014but with time as the primary currency. That\u2019s how I approached Indonesian: I didn\u2019t take any formal classes (because I had no budget for it), relying instead on self-study, conversation, and language exchanges. Only once I was back in Germany and had a decent job did I get a tutor for Indonesian. As my budget increased over the years, I began investing more in paid resources, which allowed me to learn faster and more efficiently. For both Spanish and Chinese, I found private tutors very helpful early on when it came to correcting grammar or pronunciation, and there were also a few useful and affordable apps.\n\nPronunciation has always been something I focus on from the very beginning, even if it feels strange at first. It\u2019s so much easier to get the sounds right from the start than to unlearn bad habits later. This wasn\u2019t a major issue for me with Indonesian or Spanish\u2014Indonesian pronunciation came fairly naturally, and I had already heard a lot of Spanish as a child. In fact, my Spanish accent has evolved over time due to traveling, blending influences from Spain, Bolivia, Peru, Colombia, Mexico, and more. But Chinese was a whole different beast. The tonal system is entirely foreign to European languages, so I spent weeks practicing pronunciation, including learning the Taiwanese phonetic script _Zhuyin_ (\u6ce8\u97f3). Though my tones are still shaky, I often get told that my overall pronunciation is decent compared to that of many other foreigners, so I guess the extra effort was worth it.\n\nWhen it comes to memorizing vocabulary, I find rote memorization to be very inefficient. I rely on associations and _Eselsbr\xfccken_ (mnemonics in German) to create personal connections with words. Whether I\u2019m learning Indonesian, Spanish, or Chinese, I form vivid, often silly, stories around each word to make them stick in my memory. I engage as many senses and associations as possible. I also use spaced repetition systems (SRS) like Anki, which help me review words at optimal intervals to prevent forgetting. To make it more effective, I add images and audio, and use new words in conversation as soon as possible. For Chinese characters, I used the [Hanzi Movie Method](https://www.mandarinblueprint.com/blog/chinese-mnemonics/) early on, creating stories for each character, which helped me manage the initial challenge of memorization. 
Over time, frequent exposure and practice have turned vocabulary learning into a more intuitive process for me.\n\nIn terms of resources, I\u2019ve used a mix of books, apps, and classes over the years. Apps like Anki are excellent for vocabulary retention through spaced repetition, and specialized dictionaries like [SpanishDict](https://www.spanishdict.com/) and [Pleco](https://www.pleco.com/) are indispensable for deeper language study. However, I\u2019m not a fan of apps like Duolingo that take you through a rigid, pre-defined path. They tend to focus on surface-level phrases rather than providing the fundamental principles needed to truly master a language.\n\nClasses, especially larger ones, have a similar limitation\u2014they tend to focus on exam preparation rather than providing the flexibility to explore a language freely. That said, I found that in the early stages of learning Chinese, classes were helpful. My Chinese was so bad that no one had the patience to talk to me, and a classroom setting provided a structured environment where I could build confidence and get a lot of exposure at a slower pace. The rigid structure was frustrating, but the Chinese-speaking environment was great at the time.\n\nUltimately, I believe self-study, private tutors, and language exchanges are the most flexible and effective methods. If you can afford a tutor, great; they can tailor the experience to your needs. If not, language exchanges offer a similar benefit at no cost (besides the time you invest in helping your partner learn your language). The most important resource, however, is motivation. I\u2019ve heard many people say they can\u2019t learn a language because they don\u2019t have the money, but with the vast amount of free resources available online, that\u2019s just an excuse. While in Indonesia, I learned the language without any paid classes, textbooks, or tutors\u2014just through immersion and talking to people.\n\nIn the end, the effort you put in is what determines success. No one else can teach you a language\u2014you have to learn it yourself. If you stay motivated, even with free resources, you can achieve fluency. For me, the driving force is the desire to connect with people from different cultures, to chat with everyone I meet. Whenever I get frustrated, I remind myself of that.\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/language-learning/tuna_koeln.webp)\n\n### Cultural Insight Through Language and Multiple Personalities\n\nEach language I\u2019ve learned has been a window into both the culture of the people who speak it and different sides of my own personality. The structure, tone, and rhythm of each language carry traces of the cultural values of the speakers, and this has shaped how I express myself when speaking each one. Interestingly, I find that as I switch languages, I also switch between different versions of myself.\n\n#### English: A Global Tool and the \"Neutral Me\"\n\nEnglish, for me, isn\u2019t tied to any specific culture. It\u2019s a global language, a bridge that allows people from all over the world to communicate. I use it primarily in international contexts, rather than in any English-speaking country\u2014whether in my work in science and technology or for accessing resources online. 
English has become my default language for thinking, writing, and even dreaming, despite German being my native language.\n\nWhat\u2019s fascinating about English is that, rather than connecting me to the culture of any one country, it connects me to the world. It doesn\u2019t come with a fixed cultural identity; instead, it\u2019s a flexible tool that adapts to different contexts and people. For that reason, my English-speaking personality feels balanced\u2014like a \"neutral me\" that is open to both emotion and reason. In a way, English allows me to take the best from all worlds, without being tied to a specific cultural perspective. It\u2019s the version of me that\u2019s most adaptable.\n\n#### German: Precision, Directness, and Rationality\n\nGerman, on the other hand, feels deeply rooted in the culture it comes from. The precision of the language\u2014with its cases, grammatical structures, and ability to form long compound words\u2014mirrors the cultural values of thoroughness, order, and an obsession with rules that are often associated with Germany. German communication tends to be very direct, sometimes even blunt, and there\u2019s often little room for ambiguity in how ideas are expressed. Conversations in German usually get straight to the point, reflecting a cultural tendency toward efficiency and practicality. For whatever reason, the German national railway company strictly distances itself from those values.\n\nThis German directness, which often comes across as harsh to non-German speakers, influences my own personality when I speak German. I notice that I become more assertive and less in touch with my emotions. Humor in German can also be darker and more sarcastic, and my judgments tend to be sharper. When I switch into German, I\u2019m focused more on logic, reason, and facts, and less on how my words might be received emotionally. It\u2019s not that I\u2019m unfriendly in German (at least not on a German scale), but the language itself pulls me toward a more direct approach to communication.\n\n#### Spanish: Passionate, Expressive, and Emotionally Open\n\nSpanish is a completely different experience. The language itself, with its musicality, flow, and rich emotional vocabulary, reflects the warmth and passion of the cultures that speak it. Spanish is a language that encourages expressiveness, and this is deeply connected to the values of community, connection, and emotional openness that are common in both Spain and Latin America. The flexibility of verb conjugation and the use of diminutives often add emotional nuance to even the simplest statements, and conversations are often lively, punctuated by gestures and changes in tone.\n\nWhen I speak Spanish, I find that I\u2019m much more emotionally expressive. I say things in Spanish that would feel overly dramatic or cheesy in English or German, but in Spanish, they seem perfectly natural. My Spanish personality is more passionate, and I\u2019m not afraid to be more flirty or playful. It feels like the language gives me permission to lean into emotion and connection in a way that other languages don\u2019t. I can express myself more freely and even be a bit theatrical without feeling out of place, which makes Spanish conversations feel more intimate and engaging.\n\n#### Indonesian: Simplicity, Inclusiveness, and Playfulness\n\nIndonesian is, in many ways, a language that reflects the open and inclusive nature of the people who speak it. The language is remarkably simple, with no tenses, cases, or grammatical genders. 
This simplicity, rather than limiting communication, makes it accessible and pragmatic, which aligns with Indonesian culture\u2019s emphasis on community and harmony. I always found it easy to make friends with Indonesians, and the language is really suitable for that. Indonesian doesn\u2019t get bogged down by rules; it\u2019s more about getting the message across in a way that everyone can understand, and sounding nice while doing so. Even the many borrowed words from other languages (like Arabic, Dutch, and Sanskrit) reflect a cultural openness to outside influences.\n\nWhen I speak Indonesian, I find that I take on a lighter, more playful personality. Conversations in Indonesian tend to be more relaxed and humorous, with less emphasis on precision and more focus on maintaining a friendly, easy-going vibe. My Indonesian personality feels more pragmatic, not worrying too much about perfect grammar, and more focused on just getting things done. It\u2019s a version of me that doesn\u2019t take life too seriously, and it fits perfectly with the language\u2019s straightforward structure and the laid-back attitude of the people.\n\n#### Chinese: Context, Nuance, and a Developing Personality\n\nMandarin Chinese is unique in how deeply it relies on context and nuance, which mirrors the cultural focus on relationships and social harmony in Chinese-speaking societies. The language itself uses a tonal system, meaning that slight changes in pitch can alter the meaning of a word entirely. This requires a high level of attentiveness and sensitivity to subtle differences, much like how Chinese culture often values maintaining balance and understanding in social interactions.\n\nIn Chinese, context is everything. The same word can mean completely different things depending on the surrounding words, and this reflects the importance of understanding the whole situation, not just individual elements. This emphasis on context makes Mandarin feel less direct than German or English\u2014it\u2019s about reading between the lines and paying attention to what\u2019s unsaid as much as what\u2019s said.\nI haven\u2019t yet fully developed a \"Chinese personality,\" as I\u2019m still working on becoming fluent. As I continue to improve, I\u2019m curious about what my Chinese personality will evolve into.\n\nSpeaking multiple languages hasn\u2019t just helped me connect with different cultures\u2014it\u2019s also allowed me to explore different versions of myself. Each language brings out distinct characteristics in both how I think and how I interact with others. Whether it\u2019s the balanced, global \"me\" when I speak English, the rational and assertive side that comes out in German, the passionate and emotionally expressive version of myself in Spanish, or the playful and pragmatic personality that shines through in Indonesian, each language unlocks something unique.\nAnd as I continue to grow in Chinese, I look forward to discovering what side of me it will reveal. 
It\u2019s fascinating to think that learning a language isn\u2019t just about words and grammar\u2014it\u2019s about discovering new ways to see the world and new ways to be yourself.\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/language-learning/tuna_jogja.webp)\n\n### Travel and Immersion: How Speaking the Language Makes a Difference\n\nLet\u2019s be honest: traveling with German doesn\u2019t get you very far unless you\u2019re in Germany, Austria or Switzerland (and even then, Swiss German is a whole other language\u2014I can barely understand it myself). That said, I\u2019m often surprised by how many people I meet abroad who are studying German or at least have an interest in it. Many dream of working in Germany one day, and it\u2019s always fascinating to hear why they\u2019ve chosen to tackle such a grammatically horrifying language.\n\nEnglish, of course, is much more practical for traveling. It\u2019s spoken all over the world, especially in airports, tour agencies, and other tourist-centric places. I\u2019ve found it\u2019s usually enough to get by. Still, I was surprised in Latin America by how many tour guides or hostel owners didn\u2019t actually speak English at all, even in popular tourist spots. While English is a great tool for navigating the basics, it\u2019s usually not enough if you want to go beyond the typical tourist experience and truly connect with people and places.\n\nThat\u2019s where learning the local language is the only thing that really helps. When you know the local language, you can talk to pretty much anyone\u2014not just the few who\u2019ve learned English. This opens the door to a whole new level of experience. You can ask people about their lives, their culture, their history, and they can tell it to you in their native language. There\u2019s something special about hearing stories directly in the language they were meant to be told in, without the filter of translation. To be fair, I also had wonderful experiences with local people in places that I didn\u2019t speak the language of. Usually English-speaking people I met on [Couchsurfing](https://www.couchsurfing.com/), but the number of those is, of course, much smaller.\n\nSpeaking the local language also helps you escape the typical tourist traps. You can ask locals for their personal recommendations, and those are almost always better than anything a guidebook or travel blog will suggest. On top of that, it often gets you better prices. I\u2019ve haggled my way to almost local rates at markets in Indonesia\u2014sometimes even getting into tourist attractions at the local price instead of the overpriced foreigner rate by explaining that I was living and working there as a volunteer teacher. Although, once I had to sing the national anthem to prove it\u2026_Indonesia raya, merdeka, merdeka_\u2026\n\nLanguage is also a key to participating in cultural events that would otherwise be a bit meaningless if you couldn\u2019t understand what was happening. Whether it\u2019s art shows, concerts, theater performances, or even joining a friend\u2019s family for dinner, speaking the language brings these experiences to life. 
I\u2019ve watched _Lucha Libre_ matches in Mexico, attended culture carnivals, karate classes, and concerts in Indonesia, theater plays in Bolivia, and taken part in local festivals pretty much everywhere\u2014things that would\u2019ve felt a bit out of reach, or maybe even boring, without knowing what was going on around me.\n\nIn some cases, speaking the local language isn\u2019t just a nice bonus, but a matter of security. There have been a couple of situations where I needed help but didn\u2019t dare ask because I wasn\u2019t confident I could explain what was wrong. It\u2019s a scary feeling to think that if something went really wrong, you wouldn\u2019t be able to verbally defend yourself, especially in places where there\u2019s a general suspicion toward foreigners. In contrast, there have been moments when I did speak the local language, and it made all the difference. People were willing to step in and help me, simply because I could explain the situation.\n\nEven when you only know the basics of a language, it can still make a huge difference. Just being able to say a few key words\u2014asking for food, directions, or help\u2014can get you through most situations. And modern translation apps are a blessing, too. I\u2019ve used them a lot to supplement my limited vocabulary, but nothing replaces the feeling of being able to communicate directly.\n\nSometimes, knowing just a little of the local language helps ease the tension in situations where people are afraid to speak English. In Taiwan, for example, I often ask if people can speak English when dealing with things like opening a bank account or getting my driving license, and the answer is usually, \u201cNo, not at all!\u201d But I know that\u2019s not entirely true\u2014English is taught in schools there, so most people have at least a basic grasp of it. The issue is more about the fear of making mistakes and feeling embarrassed. But if I start trying to speak in my far-from-perfect Chinese (most likely making a fool of myself in the process), suddenly people feel more comfortable meeting me halfway. They\u2019ll supplement the bits of Chinese I know with the bits of English they know, and somehow, we make it work. It\u2019s a small but powerful way of showing respect, and it breaks the ice.\n\nUltimately, just trying your best in the local language is often enough to win people\u2019s sympathy and kindness. In most places, even when I completely butcher the pronunciation or grammar, locals appreciate the effort and treat me much more warmly than if I\u2019d just relied on Google Translate or, even worse, expected them to speak English. It\u2019s not about being perfect; it\u2019s about showing that you care enough to try.\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/language-learning/tuna_mexico.webp)\n\n### Conclusion\n\nLearning and speaking multiple languages has profoundly shaped my understanding of the world and the different cultures I\u2019ve encountered. Each language brought me closer to the people who speak it, allowing me to experience places from the inside rather than as an outsider. From the structured precision of German to the emotional expressiveness of Spanish, the pragmatic simplicity of Indonesian, and the contextual complexity of Mandarin, each language opened a unique door to new experiences and a different version of myself.\n\nThat said, I think Chinese will be my last language\u2014at least for a long while. 
There\u2019s already so much linguistic chaos in my head that it's getting a bit crowded in there. Just learning Chinese and maintaining the other languages I know is already quite time-consuming. And as much as I\u2019ve enjoyed diving into the world of languages, my real passion lies elsewhere. My focus will always be on being a programmer and software engineer, a field that requires constant learning and the development of new skills. It\u2019s what I find most interesting, and it\u2019s the path I\u2019ll continue to prioritize as I move forward. While languages have been a rewarding part of my journey, programming is where I\u2019ll keep pushing myself to grow. But it will always feel great to put the languages I\u2019ve learned to use to connect to people everywhere.\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Activity Recognition with Smartphone Sensors","Using signal processing and machine learning to find out what you're doing",new Date("2022-02-05"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/activity-recognition/activitiyTitle.jpg","Can your smartphone's sensors tell you what you're doing?",["Data Science & AI/ML","Learning"],'**TL;DR:** I used sensor data from smartphones\' accelerometers and gyroscopes for human activity recognition by extracting features from their Fourier transform spectrum, power spectral density, and auto-correlation function, and training an XGBoost classifier on these features.\n\nBesides computer vision tasks, I may have some work coming up soon which involves working with data from a smartphone\'s sensors, such as the accelerometer and gyroscope. I haven\'t worked much in the realm of digital signal processing since I left academic research 3 years ago, so I thought I should probably refresh my skills in that field, and of course combine it with machine learning. I wasn\'t too keen on collecting a large data set "just for fun" by myself, so I looked for some accelerometer/gyroscope data sets online for training a model and then being able to do predictions with my own phone. I don\'t have a car myself so that\'s out for me, and I haven\'t found a data set for phones on bicycles, so I guess I\'ll have to do some moves with my own body. I found a promising data set for human activity recognition online, which I\'ll describe in more detail below. Here, I\'ll do a proof of concept to see if I can extract features from the data to train a machine learning model with. I\'d like to later build a progressive web app with predictive functionality, but unfortunately the numpy/scipy ecosystem isn\'t really available in JavaScript (though there are some partial implementations), and I\'ll need to see if it\'s not too much of a hassle to implement this myself. Anyways, this POC is already exciting, so let\'s have a look at the data.\n\n### The Data Set\n\nThe data set I use is the [Smartphone-Based Recognition of Human Activities and Postural Transitions Data Set](http://archive.ics.uci.edu/ml/datasets/Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions), that is, an updated version of the original data set that also contains the raw recorded data. I downloaded it from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/index.php) (donated by Jorge L. Reyes-Ortiz et al., you can find links to their papers on the data set site), but you can also find it on [Kaggle](https://www.kaggle.com/uciml/human-activity-recognition-with-smartphones) with plenty of code examples. 
This is a well-known data set, and building a well-performing classifier is not much of a challenge. The authors already provide 561 pre-engineered features that have been derived from the raw data. The sensor signals have been pre-processed with noise filters and then sampled in fixed-width sliding windows of 2.56 seconds and 50% overlap, meaning 128 readings/window at a sampling frequency of 50 Hz. From each of these windows, the authors calculated for each sensor and axis the derivative (they called it jerk) and magnitude over all axes. On all XYZ axes, their jerks, and magnitudes, they then calculated various quantities from both the time and the frequency domain such as mean, standard deviation, median absolute deviation, max and min values, signal magnitude area, energy, interquartile range, signal entropy, correlation between two signals, mean frequencies, skewness, kurtosis, and some more (for details, please refer to their repo\'s dataset description in the file `features_info.txt`). This adds up to the proud number of 561 features, with which one can easily train an XGBoost classifier that scores 0.93 in both accuracy and F1.\n\nIf this is already a well-solved problem, why do I even care about this data set? Well, because the authors already kinda took the whole fun out of it by doing all the feature engineering. I want to learn about signal processing, I want to play with raw sensor data, calculate their spectra, and see what the peaks in there tell me. So, I\'ll choose a different approach as compared to the authors of the data set. Instead of computing quantities over the whole sampled windows in the time and frequency domain, I\'ll start from the raw data, filter out gravity and noise, calculate the Fourier transform, power spectral density (using Welch\'s method), and auto-correlation function for each window, and use their peaks as features to train a machine learning model on. 
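\n\nJust for reference before we move on to the raw data: a minimal baseline on the authors\' ready-made features could look roughly like the sketch below. The exact file names inside the `Train` and `Test` folders are an assumption on my part, so treat this as an illustration rather than a verified snippet.\n\n```python\n# Illustrative baseline on the authors\' 561 pre-engineered features.\n# The file names below are assumed to follow the standard HAPT layout.\nimport numpy as np\nfrom xgboost import XGBClassifier\nfrom sklearn.metrics import accuracy_score, f1_score\n\nX_tr = np.loadtxt("./HAPT Data Set/Train/X_train.txt")\ny_tr = np.loadtxt("./HAPT Data Set/Train/y_train.txt").astype(int)\nX_te = np.loadtxt("./HAPT Data Set/Test/X_test.txt")\ny_te = np.loadtxt("./HAPT Data Set/Test/y_test.txt").astype(int)\n\n# keep only the 6 basic activities (labels 1-6), drop the postural transitions\ntr_mask, te_mask = y_tr <= 6, y_te <= 6\nclf = XGBClassifier().fit(X_tr[tr_mask], y_tr[tr_mask] - 1)  # 0-based labels\npred = clf.predict(X_te[te_mask])\nprint("accuracy:", accuracy_score(y_te[te_mask] - 1, pred))\nprint("weighted F1:", f1_score(y_te[te_mask] - 1, pred, average="weighted"))\n```\n\n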
I\'ll also try to make the data somewhat orientation-independent by transforming them with a principal component analysis (PCA) and see how the results compare to the data with the original orientation.\n\nLet\'s start by reading the data from file and see what the initial data frame looks like:\n\n```python\nfrom typing import Tuple, List, Dict, Union\nimport pandas as pd\nimport numpy as np\nimport scipy\nfrom glob import glob\n\n# load the labels from file\nlabels = pd.read_csv(\n "./HAPT Data Set/RawData/labels.txt",\n header=None,\n sep=" ",\n names=["experiment", "user", "activity", "start", "stop"]\n)\n\n# load the raw sensor data from file\ndf = pd.DataFrame([], columns=[\n "acc_x", "acc_y", "acc_z", "acc_total",\n "gyro_x", "gyro_y", "gyro_z", "gyro_total",\n "activity"\n])\n# by looping over the data files of 61 experiments\nfor i in range(1, 62):\n acc_df = pd.read_csv(\n glob(f"./HAPT Data Set/RawData/acc_exp{str(i).zfill(2)}_*.txt")[0],\n header=None,\n sep=" ",\n names=["acc_x", "acc_y", "acc_z"]\n )\n acc_df["acc_total"] = np.sqrt(\n acc_df["acc_x"]**2 + acc_df["acc_y"]**2 + acc_df["acc_z"]**2\n )\n\n gyro_df = pd.read_csv(\n glob(f"./HAPT Data Set/RawData/gyro_exp{str(i).zfill(2)}_*.txt")[0],\n header=None,\n sep=" ",\n names=["gyro_x", "gyro_y", "gyro_z"]\n )\n gyro_df["gyro_total"] = np.sqrt(\n gyro_df["gyro_x"]**2 + gyro_df["gyro_y"]**2 + gyro_df["gyro_z"]**2\n )\n\n user = glob(f"./HAPT Data Set/RawData/gyro_exp{str(i).zfill(2)}_*.txt")[0][-6:-4]\n df_merged = acc_df.merge(gyro_df, left_index=True, right_index=True)\n df_merged["experiment"] = i\n df_merged["user"] = int(user)\n df_merged["activity"] = np.NaN\n\n # fill in the labels (label-based slicing with .loc includes the stop index)\n for _, label in labels[labels["experiment"] == i].iterrows():\n df_merged.loc[label["start"]:label["stop"], "activity"] = label["activity"]\n\n df = pd.concat([df, df_merged], axis=0)\n\n# get rid of the activity transitions, they\'re not of interest\ndf = df[df.activity.between(1,6)]\n\ndf.sample(5)\n```\n\n```python\n acc_x acc_y acc_z acc_total gyro_x gyro_y gyro_z gyro_total activity experiment user\n5696 0.944444 -0.029167 -0.326389 0.999678 -0.024740 0.029627 -0.005192 0.038946 4.0 29.0 14.0\n17287 0.543056 -0.043056 0.031944 0.545695 -0.789543 0.349415 -0.208610 0.888250 3.0 5.0 3.0\n15789 1.119445 -0.506944 -0.081944 1.231610 0.099571 0.905913 -0.113926 0.918462 2.0 42.0 21.0\n12060 0.956944 -0.008333 -0.294444 1.001254 0.146302 0.066279 -0.180205 0.241394 5.0 34.0 17.0\n5166 0.279167 0.445833 0.848611 0.998420 0.019548 0.003054 0.000916 0.019806 6.0 32.0 16.0\n```\n\nAs you can see in the `activity` column, there are activities numbered 1 to 6 to be classified:\n\n```\n1 WALKING\n2 WALKING_UPSTAIRS\n3 WALKING_DOWNSTAIRS\n4 SITTING\n5 STANDING\n6 LAYING\n```\n\nLet\'s have a look at what the raw sensor data looks like for these activities. I\'ll just show you the plots here and not the full `matplotlib` code, as it is pretty verbose and doesn\'t really add anything to the understanding.\n\n![Sensor data corresponding to the different activities.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/activity-recognition/activities.svg)\n\n### Preprocessing the data\n\nThere are two things that I want to get rid of before building features: gravity and noise. The accelerometer measures the total acceleration, which means it also measures gravity. 
The positioning of the smartphones in the authors\' experiments is always constant, so gravity should always appear along the same axis (and obviously stay constant) and wouldn\'t add any useful information here. I\'ve read [in a paper](https://arxiv.org/pdf/1805.03368.pdf) about a method of using the gravity contribution on all three axes to transform an arbitrarily oriented sensor coordinate system to one that is uniformly oriented, thereby making the data collection orientation-independent, but that\'s not what I\'ll do here. The authors of the data set separated the gravity acceleration from the data with a filter, and that\'s what I\'ll do as well. I\'ll assume that the gravity contribution to the acceleration is of very low frequency, below 0.3 Hz, and use a high-pass filter to only let frequencies higher than 0.3 Hz pass. Furthermore, most sensors will collect some noise in addition to the actual signal. This noise is usually of high frequencies, at least compared to normal human movements. The sampling frequency of these measurements is 50 Hz, so no frequency higher than half of that (the Nyquist frequency of 25 Hz) could be captured anyway. I\'ll assume that a human shouldn\'t be able to do any movement more than 15 times a second, so I\'ll also apply a low-pass filter with a cutoff frequency of 15 Hz.\n\nI\'ll use a Butterworth filter as it\'s designed to have a frequency response as flat as possible in the passband, which means it barely alters the part of the signal that we want to keep. `scipy.signal` offers diverse functionality for designing signal filters and applying them. We\'ll do this in two steps here: getting the Butterworth filter coefficients and then applying the filter. The function used to apply the filter here is `scipy.signal.filtfilt`, which applies a linear digital filter twice, once forward and once backwards. This results in no phase shift in the signal, which probably isn\'t relevant for this use case, but could e.g. be important when denoising a spectrogram where the exact peak position really matters.\n\n```python\nfrom scipy.signal import butter, filtfilt, welch\nfrom scipy.fft import fft, fftfreq\n\n# function to design the filter coefficients\ndef butter_coef(\n cutoff: float,\n fs: float,\n order: int = 5,\n btype: str = "low"\n) -> Tuple[np.array, np.array]:\n nyq = fs / 2\n normalized_cutoff = cutoff / nyq\n b, a = butter(order, normalized_cutoff, btype=btype, analog=False)\n return b, a\n\n# function applying the filter\ndef butter_filter(\n data: np.array,\n cutoff: float,\n fs: float,\n order: int = 5,\n btype: str = "low"\n) -> np.array:\n b, a = butter_coef(cutoff, fs, order=order, btype=btype)\n y = filtfilt(b, a, data, method="gust")\n return y\n```\n\nNow, let\'s apply the Butterworth high-pass filter to separate out gravitational acceleration from the acceleration signals and a low-pass filter for noise reduction to all raw signals. 
Note that no gravitational contribution has to be removed from the gyroscope data; that\'s the reason why I don\'t combine both filters into one band-pass filter.\n\n```python\n# set some parameters first\ncutoff_lp=15 # cutoff frequency (in Hz) for the low-pass filter\ncutoff_hp=0.3 # cutoff frequency (in Hz) for the high-pass filter\norder=5 # order of the Butterworth filter\nfs = 50.0 # data sampling frequency (in Hz)\n\n# apply the filters\nfor col in df.columns.values[:-3]:\n # filter out gravity\n if col.startswith("acc"):\n df[col] = butter_filter(\n df[col],\n cutoff=cutoff_hp,\n fs=fs, order=order,\n btype="high"\n )\n\n # filter out noise\n df[col] = butter_filter(\n df[col],\n cutoff=cutoff_lp,\n fs=fs,\n order=order,\n btype="low"\n )\n```\n\nLet\'s have a look at how the data looks before and after filtering:\n\n![Sensor data before and after filtering.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/activity-recognition/filtered.svg)\n\n### Orientation-independence\n\nAs I mentioned before, I thought it may be a good idea to think about orientation-independence when trying to recognize human activities from smartphone sensor data. In the experiments from which the data set originates, the smartphones were always mounted in the same way with the same orientation at the hips of the users. This is nice if you want to investigate in which direction an activity accelerates the phone, relative to the user. But in reality, a user might have the phone crammed into any of their pockets or a backpack or wherever, and would certainly not care about keeping the phone in the same orientation relative to their body at all times. There are a couple of ways we could achieve such orientation-independence. For example, we could rotate the coordinate system in a way that maximizes the gravitational contribution along one axis and say that\'s the vertical axis. We could then consider the magnitude of the two remaining axes as one horizontal component, which is done in [this paper](https://arxiv.org/pdf/1805.03368.pdf). But I kinda want to keep three axes and I already removed gravity, so let\'s do something else. Think about variability: if you plotted every sensor measurement in 3D, you\'d end up with a blob of data points that may be oriented in different ways depending on the orientation of the phone, but should have a characteristic shape depending on the measured activity. The blobs should then have a direction in which the variance is the highest. So, if we do a principal component analysis and find 3 principal components, we should be able to preserve the whole variance of the data but find a new coordinate system (the 3 ortho-normal principal components along the 3 directions of highest variance in the data) into which we can transform the data. I found [a paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5579846/) doing basically the same using the first 3 singular values and corresponding matrix columns from a singular value decomposition.\n\nHow exactly does that work? We consider that for every sampled window of data, in principle the orientation could be different, so we\'ll do the transform for each window of 2.56 seconds individually, based on the XYZ components of accelerometer and gyroscope. Since the two types of sensors should have a constant orientation relative to each other, we\'ll concatenate the data from both sensors and find the common principal components. To bring the variance of the two sensors onto the same scale, we\'ll standard-scale them first. 
Having the 3 principal components, we transform the data into the new coordinate system and separate the measurements of the two sensors again:\n\n```python\nfrom sklearn.decomposition import PCA\nfrom sklearn.preprocessing import StandardScaler\n\ndef orientation_invariant_transform(\n df_acc_xyz: pd.DataFrame,\n df_gyro_xyz: pd.DataFrame\n) -> pd.DataFrame:\n # scale the data by removing the mean and scaling to unit variance\n df_acc_xyz = pd.DataFrame(\n StandardScaler().fit_transform(df_acc_xyz),\n columns=df_acc_xyz.columns\n )\n df_gyro_xyz = pd.DataFrame(\n StandardScaler().fit_transform(df_gyro_xyz),\n columns=df_gyro_xyz.columns\n )\n\n # concat data from both sensors\n concat = np.concatenate([df_acc_xyz.values, df_gyro_xyz.values])\n\n # transform data into coordinate system of the principal components\n concat_oi = PCA(n_components=3).fit_transform(concat)\n\n # separate the two sensors\' data again\n df_acc_oi = pd.DataFrame(\n concat_oi[:len(concat_oi)//2],\n columns=df_acc_xyz.columns\n )\n df_gyro_oi = pd.DataFrame(\n concat_oi[len(concat_oi)//2:],\n columns=df_gyro_xyz.columns\n )\n # calculate the magnitude / Euclidean norm\n df_acc_oi["acc_total"] = np.sqrt(\n df_acc_oi["acc_x"]**2\n + df_acc_oi["acc_y"]**2\n + df_acc_oi["acc_z"]**2\n )\n df_gyro_oi["gyro_total"] = np.sqrt(\n df_gyro_oi["gyro_x"]**2\n + df_gyro_oi["gyro_y"]**2\n + df_gyro_oi["gyro_z"]**2\n )\n\n return df_acc_oi.merge(df_gyro_oi, left_index=True, right_index=True)\n```\n\nI only define the function here; it will be called later when the features for each data window are generated.\n\n### Features from time-frequency domain transformations\n\nI\'ve explained in some detail the features that the authors of the data set have generated from both the time and the frequency domain. I\'d like to create features here whose meaning can be understood in a conceptual and graphical way. Specifically, I want to apply certain transforms to the data from which I will then extract the position and height of peaks. The transforms/functions I\'ll use are the Fourier transform, the power spectral density (using Welch\'s method), and the auto-correlation function.\n\nLet\'s quickly explain what these three are. Essentially, they all give us information about the periodicities in the signal. Imagine your smartphone recording your acceleration while you walk. If you\'re walking at a constant speed, the bumps in the acceleration signal should be spaced evenly over time. Maybe there will be some quick jerk movements that you do while walking that may appear in the signal at shorter intervals than your main steps. These movements overlay, making up the resulting signal, similar to different musical notes combining into one resulting sound wave. Just as with sound waves, we can either look at how the amplitude of the wave changes over time, or we transform the signal into the frequency domain and see which frequencies contribute with which amplitude to the periodic signal. If we did that with the walking signal, we might see a large contribution at a lower frequency, corresponding to the main steps, and maybe some smaller contribution at higher frequencies, corresponding to quicker jerk movements. The Fourier transform (FT) does exactly that transformation from the time domain (where we see the raw signal) into the frequency domain (where we see the spectrum of the signal), by decomposing the original signal into its contributing frequencies. The power spectral density (PSD) is conceptually very similar. 
It calculates how the power of the signal is distributed over the frequencies in its spectrum, i.e. it\'s the spectral energy distribution that would be found per unit time. To calculate the PSD, we use [Welch\'s method](https://en.wikipedia.org/wiki/Welch%27s_method), which computes an estimate of the power spectral density by dividing the data into overlapping segments, computing a modified power spectrum for each segment, and averaging the spectra. Lastly, the auto-correlation function kinda stays in the time domain but also measures the periodicity of a signal. It calculates the correlation of a signal with a delayed copy of itself as a function of delay/time-lag. Basically, that means that if a periodic sine signal exhibits a maximum every 2 seconds, the autocorrelation function would also show a peak at 2, 4, 6, 8, and so on seconds, because that\'s where the lagged version of the signal is similar (or identical, in the case of a noiseless signal) to the un-lagged signal.\n\nThis will all make much more sense if we plot it. In the plot below, we can see the original (already filtered) signal from the accelerometer when walking downstairs, its spectrum (the frequency domain representation) created by the Fourier transform, the power spectral density, and the autocorrelation function.\n\n![Original accelerometer data, its Fourier transform, power spectral density, and autocorrelation function.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/activity-recognition/features.svg)\n\nYou can see that the main "humps" in the original signal are about 0.65 seconds apart. This seems to be how long it takes to take a step when walking downstairs. We can find this main interval as well in the autocorrelation function, where we find peaks at multiples of ca. 0.65 seconds of lag, and in the Fourier transform and power spectral density, where we can see that the largest contribution to the spectrum and its power comes from a corresponding base frequency of ca. 1.54 Hz. I\'ve also marked some of the most prominent peaks occurring in these plots. The position and intensity of these peaks are what I\'ll use as features for the machine learning model here.\n\nI\'m not aware of a function that readily implements the estimated autocorrelation function, but we can easily calculate it using `numpy.correlate` to correlate the input with itself. We\'ll also normalize it by subtracting the mean and dividing by the variance:\n\n```python\ndef estimated_autocorrelation(x: np.array) -> np.array:\n N = len(x)\n variance = x.var()\n x = x-x.mean()\n r = np.correlate(x, x, mode = \'full\')[-N:]\n result = r/(variance*(np.arange(N, 0, -1)))\n return result\n```\n\nSciPy provides functionality for calculating the Fourier transform (`scipy.fft.fft`) and its corresponding frequencies (`scipy.fft.fftfreq`), as well as Welch\'s method (`scipy.signal.welch`). The Fourier transform returns an array of complex numbers, and for a real signal its magnitude spectrum is symmetric around 0, so we\'ll do a couple of things here: take the magnitude of the complex numbers (with `abs`), drop all frequencies below 0, and normalize the outputs. 
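\n\nTo convince ourselves that these conventions behave as expected, here is a tiny sanity check on a made-up signal (a pure 1.5 Hz sine sampled at the same 50 Hz over one 2.56 s window; purely illustrative, not part of the actual pipeline): the highest FFT peak should land in the frequency bin closest to 1.5 Hz, and the first autocorrelation peak close to the corresponding period of about 0.67 s.\n\n```python\nimport numpy as np\nfrom scipy.fft import fft, fftfreq\nfrom scipy.signal import find_peaks\n\nfs = 50.0                           # sampling frequency in Hz\nt = np.arange(0, 2.56, 1.0 / fs)    # one 2.56 s window, 128 samples\nsine = np.sin(2 * np.pi * 1.5 * t)  # synthetic 1.5 Hz "step" signal\n\nN = len(sine)\ny_fft = 2.0 / N * np.abs(fft(sine)[:N // 2])  # magnitude, positive frequencies, normalized\nx_fft = fftfreq(N, 1.0 / fs)[:N // 2]\nprint(x_fft[np.argmax(y_fft)])      # ~1.56 Hz, the frequency bin closest to 1.5 Hz\n\ny_acf = estimated_autocorrelation(sine)\nfirst_peak = find_peaks(y_acf)[0][0]  # first local maximum after lag 0\nprint(first_peak / fs)              # ~0.66 s, roughly one period of the sine\n```\n\n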
So, for each component (meaning each axis and the magnitude of each sensor), we\'ll now get the 5 highest peaks of the Fourier transform, the power spectral density, and the autocorrelation function:\n\n```python\n# we\'ll consider only the top 5 highest peaks\ntop_n_peaks = 5\n\ndef find_n_highest_peaks(\n x: np.array,\n y: np.array,\n top_n: int\n) -> Tuple[np.array, np.array]:\n # find all present peaks\n peaks = scipy.signal.find_peaks(y)[0]\n heights = y[peaks]\n\n # if there are enough peaks, sort them by decreasing height\n if len(peaks) >= top_n:\n idx = heights.argsort()[:-top_n-1:-1]\n peaks = peaks[idx]\n return x[peaks], y[peaks]\n\n # if there are not enough peaks, sort the present ones\n # by decreasing height and pad the rest with zeros\n else:\n n_missing_peaks = top_n - len(peaks)\n idx = heights.argsort()[::-1]\n peaks = peaks[idx]\n return (\n np.concatenate([x[peaks], np.zeros(n_missing_peaks)]),\n np.concatenate([y[peaks], np.zeros(n_missing_peaks)])\n )\n\ndef get_peak_features_for_component(\n signal: pd.DataFrame,\n component: str,\n top_n_peaks: int,\n fs: float\n) -> pd.DataFrame:\n N = len(signal)\n\n # FFT - Fast Fourier Transform\n y_fft = 2.0/N * np.abs(fft(signal[component].values)[:N//2])\n x_fft = fftfreq(N, 1.0 / fs)[:N//2]\n peaks_fft_x, peaks_fft_y = find_n_highest_peaks(x_fft, y_fft, top_n_peaks)\n peaks_fft_sum = peaks_fft_x + peaks_fft_y\n\n # PSD - Power Spectral Density using Welch\'s method\n x_psd, y_psd = welch(signal[component].values, fs=fs)\n peaks_psd_x, peaks_psd_y = find_n_highest_peaks(x_psd, y_psd, top_n_peaks)\n peaks_psd_sum = peaks_psd_x + peaks_psd_y\n\n # ACF - estimated auto-correlation function\n y_acf = estimated_autocorrelation(signal[component].values)\n x_acf = np.array([1/fs * n for n in range(0, N)])\n peaks_acf_x, peaks_acf_y = find_n_highest_peaks(x_acf, y_acf, top_n_peaks)\n peaks_acf_sum = peaks_acf_x + peaks_acf_y\n\n # create the column names for the features of this component\n # per component there are\n # 3 (fft, psd, acf) * 3 (x, y, sum) * top_n_peaks features\n columns = []\n for feat in [\n "peaks_fft_x", "peaks_fft_y", "peaks_fft_sum",\n "peaks_psd_x", "peaks_psd_y", "peaks_psd_sum",\n "peaks_acf_x", "peaks_acf_y", "peaks_acf_sum",\n ]:\n for i in range(top_n_peaks):\n columns.append(f"{component}_{feat}_{i}")\n\n feature_values = np.concatenate([\n peaks_fft_x, peaks_fft_y, peaks_fft_sum,\n peaks_psd_x, peaks_psd_y, peaks_psd_sum,\n peaks_acf_x, peaks_acf_y, peaks_acf_sum\n ]).reshape(1,-1)\n\n return pd.DataFrame(feature_values, columns=columns)\n```\n\nThis will get us all the features we need for one component (meaning each axis and the magnitude of each sensor) and one time window. To process the entire raw data set, we\'ll loop over all time windows and components. 
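\n\nFor a single window, a call could look like this (just an illustration of the resulting shape and column naming; the chosen window and component are arbitrary):\n\n```python\n# illustrative only: peak features of one component for the first 2.56 s window\nwindow = df.iloc[0:128]\nfeatures = get_peak_features_for_component(window, "acc_total", top_n_peaks=5, fs=50.0)\nprint(features.shape)  # (1, 45): 3 transforms * 3 (x, y, x+y) * 5 peaks\nprint(features.filter(like="peaks_fft_x").round(2))  # the 5 FFT peak frequencies\n```\n\n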
Just as the authors of the data set did, we\'ll use fixed-width sliding windows of 2.56 seconds and 50% overlap, meaning 128 readings/window at a sampling frequency of 50 Hz:\n\n```python\n# the data is sampled in fixed-width sliding windows of 2.56 sec and 50% overlap (128 readings/window)\nwindow_length = 128\noverlap_length = window_length // 2\n\ndef create_features(\n df: pd.DataFrame,\n window_length: int,\n overlap_length: int,\n top_n_peaks: int,\n fs: float,\n oi_transform: bool,\n calc_jerk: bool = True\n) -> pd.DataFrame:\n # the final output will be saved here\n df_features = None\n\n # loop over all windows\n for i in range(0, len(df), overlap_length):\n signal = df.iloc[i:i+window_length]\n\n # if the window contains more than 1 activity, skip it\n if signal["activity"].nunique() > 1:\n continue\n\n # if desired, do the orientation-invariant transform\n if oi_transform:\n df_oi = orientation_invariant_transform(\n signal[["acc_x", "acc_y", "acc_z"]],\n signal[["gyro_x", "gyro_y", "gyro_z"]]\n )\n signal = pd.concat(\n [df_oi, signal.iloc[:,-3:].reset_index(drop=True)],\n axis=1\n )\n\n # loop over all components and create features for them\n feature_row = None\n for component in signal.columns[:-3]:\n component_features = get_peak_features_for_component(\n signal,\n component,\n top_n_peaks,\n fs\n )\n feature_row = (feature_row.merge(\n component_features,\n left_index=True,\n right_index=True\n )\n if feature_row is not None\n else component_features)\n\n # set the label for the window\n feature_row["activity"] = signal["activity"].iloc[0]\n\n # append the window to the final feature data frame\n df_features = (pd.concat([df_features, feature_row])\n if df_features is not None\n else feature_row)\n\n return df_features.reset_index(drop=True)\n```\n\n### Training a machine learning model\n\nNow we finally have everything together to see how a model would perform on these features. The authors of the data set have provided a train-test-split that ensures that no users are in both the training and test set. 
This way we test if a model learns to truly generalize well from one person to another and doesn\'t just learn the moves of a particular person by heart.\n\n```python\nwith open("./HAPT Data Set/Train/subject_id_train.txt", "r") as file:\n train_subjects = set([float(u) for u in file.read().splitlines()])\n\nwith open("./HAPT Data Set/Test/subject_id_test.txt", "r") as file:\n test_subjects = set([float(u) for u in file.read().splitlines()])\n\nprint(\n f"Test fraction: " f"{len(test_subjects) / (len(train_subjects) + len(test_subjects))}"\n)\n```\n\n```python\nTest fraction: 0.3\n```\n\nLet\'s create the feature data for both the data with the original orientation as well as for the data transformed to be orientation-independent:\n\n```python\n# for the original orientation\n# train set\ndf_features_train = create_features(\n df[df["user"].isin(train_subjects)],\n window_length,\n overlap_length,\n top_n_peaks,\n fs,\n oi_transform=False\n)\n# test set\ndf_features_test = create_features(\n df[df["user"].isin(test_subjects)],\n window_length,\n overlap_length,\n top_n_peaks,\n fs,\n oi_transform=False,\n)\n# split features and labels\nX_train = df_features_train.drop("activity", axis=1)\ny_train = df_features_train["activity"]\nX_test = df_features_test.drop("activity", axis=1)\ny_test = df_features_test["activity"]\n\n# for the orientation-independent transformation\n# train set\ndf_features_oi_train = create_features(\n df[df["user"].isin(train_subjects)],\n window_length,\n overlap_length,\n top_n_peaks,\n fs,\n oi_transform=True\n)\n# test set\ndf_features_oi_test = create_features(\n df[df["user"].isin(test_subjects)],\n window_length,\n overlap_length,\n top_n_peaks,\n fs,\n oi_transform=True,\n)\n# split features and labels\nX_train_oi = df_features_oi_train.drop("activity", axis=1)\ny_train_oi = df_features_oi_train["activity"]\nX_test_oi = df_features_oi_test.drop("activity", axis=1)\ny_test_oi = df_features_oi_test["activity"]\n```\n\nI ran a small randomized hyper-parameter optimization and came up with the following hyper-parameters for an XGBoost classifier:\n\n```python\nbest_params = {\n \'n_estimators\': 300,\n \'min_child_weight\': 1,\n \'max_depth\': 3,\n \'lambda\': 1,\n \'eta\': 0.5\n}\n```\n\nNow let\'s see how well the two versions of features perform:\n\n```python\nfrom xgboost import XGBClassifier\nfrom sklearn.metrics import classification_report\n\nlabels = [\n "WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS",\n "SITTING", "STANDING", "LAYING"\n]\n\n# train and score a classifier on the original orientation data\nclf = XGBClassifier(**best_params).fit(X_train, y_train)\nprint(classification_report(y_test, clf.predict(X_test), target_names=labels))\n```\n\n```\n precision recall f1-score support\n\n WALKING 0.98 0.99 0.98 514\n WALKING_UPSTAIRS 0.94 0.97 0.96 449\nWALKING_DOWNSTAIRS 0.98 0.94 0.96 402\n SITTING 0.82 0.82 0.82 483\n STANDING 0.86 0.90 0.88 540\n LAYING 0.92 0.88 0.90 530\n\n accuracy 0.91 2918\n macro avg 0.92 0.91 0.92 2918\n weighted avg 0.91 0.91 0.91 2918\n```\n\n```python\n# train and score a classifier on the orientation-independent data\nclf_oi = XGBClassifier(**best_params).fit(X_train_oi, y_train_oi)\nprint(classification_report(\n y_test_oi,\n clf_oi.predict(X_test_oi),\n target_names=labels\n))\n```\n\n```\n precision recall f1-score support\n\n WALKING 0.96 0.93 0.94 514\n WALKING_UPSTAIRS 0.95 0.94 0.94 449\nWALKING_DOWNSTAIRS 0.92 0.93 0.93 402\n SITTING 0.51 0.47 0.49 483\n STANDING 0.66 0.77 0.71 540\n LAYING 0.69 0.65 0.66 530\n\n
 accuracy 0.77 2918\n macro avg 0.78 0.78 0.78 2918\n weighted avg 0.77 0.77 0.77 2918\n```\n\nWe can see that the original model performs pretty well: at an overall accuracy of 91%, it\'s only 2% worse than what I scored with the features provided by the authors of the data set. It is evident that it performs particularly well at recognizing the activities that involve a lot of movement, such as walking (up or downstairs), but performs worse on the "still" activities of sitting, standing, and laying. This is even more clearly visible on the orientation-independent data. The overall accuracy has dropped a lot to 77%, but if we look more closely at the individual classes, we see that this is mostly due to a decrease in performance for the "still" activities. The walking activities are still recognized fairly well. This makes sense if we remember how we try to get the orientation-independence: We transform the data into a new coordinate basis defined by 3 principal components. These are the ortho-normal components along the directions of highest variance in the data. For the walking activities, where we have a lot of movement, the variance along certain directions should be clearly higher than along other directions; hence, the transformation works relatively reliably for different examples. But for the "still" activities, the movements and the recorded acceleration/angular velocity probably result only from body shaking or breathing, are much smaller in magnitude and variance, and the variance is probably less directional. Hence, the transformation works less reliably for different examples. I think using principal component analysis is a good option for activities that involve sufficient amounts of movement, but I\'d like to apply it to data from many sensors that have actually been mounted in different orientations to see if it really works. Something like that was done with singular value decomposition in [this paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5579846/), and their results are pretty good.\n\n### Conclusion\n\nWe\'ve used a data set containing sensor data from smartphones for recognizing human activities. We have seen how we can use digital signal processing techniques to filter noise from the raw data, calculate various transforms on it such as the Fourier transform, power spectral density, and autocorrelation function, and extract their peaks as features for a machine learning model. We\'ve also seen how principal component analysis can be used to transform the data in a way that is invariant to sensor orientation. Finally, we\'ve trained XGBoost classifiers on both versions (original orientation and orientation-independent) of the data. Both versions of features perform well on activities that involve a lot of movement, but the one transformed with principal component analysis has trouble recognizing activities with less movement due to the low variance in the corresponding data.\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Instance segmentation for fashion","Finding and classifying apparel in images with Mask R-CNNs",new Date("2021-12-12"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/mask-rcnn/fashion_segmentation.png","Segment fashion images to let AI figure out what people are wearing!",["Data Science & AI/ML","Learning"],'At the moment, I\'m super excited about computer vision with Deep Learning, as you can probably tell by looking at most of my last posts. 
After my [dog breed classifier](https://pascal-bliem.com/doggo-snap), I recently had a look at object detection with [YOLO](https://pascal-bliem.com/blog/object%20detection%20yolo), and now I\'m looking at the somewhat similar problem of instance segmentation. A friend of mine is co-founder of the sustainable fashion startup [La Vouga](https://lavouga.com/), which connects customers in search of ethical and sustainable fashion with independent artisan makers across Europe. We recently talked about how it would be great to find clothing matching certain search terms on their partners\' websites, without having to rely on (sometimes weird and heterogeneous) product descriptions. If we could just let an AI tag relevant images with the type of apparel and descriptive attributes, this would greatly help customers find the hand-crafted slow fashion they\'re after, by just typing what they\'re looking for or uploading images with similar clothing in La Vouga\'s search.\n\nIf we want to treat this scenario as a computer vision task, we\'re dealing with instance segmentation. That means we\'re not just detecting an object (clothing in this case) or semantically segmenting the entire image into different classes, but we want to find the exact pixel locations of the different instances of clothing in the image so that we can have a look at them individually and extract descriptive attributes for each instance. To see if this is promising in a proof of concept, we of course don\'t want to collect our own annotated training data set yet; hence, I\'ll describe in the following what kind of data is out there that can be used for this task, and what kind of Deep Learning architectures may be able to solve the problem. In the end I will train a Mask R-CNN to perform instance segmentation on fashion images, but only classifying the instances into apparel categories - not yet predicting descriptive attributes. I\'m planning to modify some of the detection models\' source code in [torchvision](https://github.com/pytorch/vision) to incorporate the attribute prediction as well, but this may take some time, so I\'ll postpone that part to a future post. Okay, let\'s get started.\n\n### Fashion image segmentation datasets\n\nWhen I started researching this topic, I was really surprised by how many different datasets and papers were out there that dealt with computer vision tasks on fashion images. But actually, it makes a lot of sense. The clothing e-commerce industry is huge, and there\'s a huge amount of money to earn; naturally, people are investing in AI research to find out what people are wearing and how it looks on them, track rising fashion trends, or match products from commercials with real people wearing them.\n\nMost machine learning practitioners have probably encountered the Fashion-MNIST dataset (not the one with handwritten digits) at some point, as it is a popular Kaggle [competition](https://www.kaggle.com/zalando-research/fashionmnist) for beginners. The dataset contains small grayscale images of clothing products from 10 different categories, but no humans are attached to these pieces of apparel. Some people have tried identifying clothing on people as early as 2012 at [chictopia.com](http://www.chictopia.com/). In the paper [\u201cParsing clothing in fashion photographs\u201d by Yamaguchi et al. 2012](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.233.840&rep=rep1&type=pdf), the authors set up the Fashionista dataset containing ca. 
650 manually annotated images with clothing segmentation and pose estimation. They extended it with the Paperdoll dataset from the paper ["Retrieving Similar Styles to Parse Clothing" by Yamaguchi et al. 2014](http://vision.is.tohoku.ac.jp/~kyamagu/papers/yamaguchi2014retrieving.pdf), in which they automatically/weakly annotated a million images. Again from chictopia.com, the [Chictopia10K dataset](https://github.com/lemondan/HumanParsing-Dataset) was presented in ["Deep Human Parsing with Active Template Regression" by Liang et al. 2015](https://arxiv.org/abs/1503.02391). You can find an example of the segmentation masks in the figure below. This latter dataset was e.g. used in ["A Generative Model of People in Clothing" by Lassner et al. 2017](https://arxiv.org/abs/1705.04098) to build generative models that can produce realistic images of humans in different poses (though their faces look like horrifying demons from hell \ud83d\ude05).\n\n![Segmentations predicted on the Chictopia10K dataset in Liang et al. 2015](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/mask-rcnn/image_masks.png)\n\nThese datasets were all pretty interesting, but I was looking for something a bit more modern, with more classes, attributes and especially more training examples. Ebay\'s researchers have released a large (ca. 50k images) dataset called ModaNet in ["ModaNet: A Large-scale Street Fashion Dataset with Polygon Annotations" by Zheng et al. 2019](https://arxiv.org/abs/1807.01394), which contains 13 meta-categories of clothing. Another large (ca. 44k images) dataset is [DeepFashion2](https://github.com/switchablenorms/DeepFashion2) described in ["DeepFashion2: A Versatile Benchmark for Detection, Pose Estimation, Segmentation and Re-Identification of Clothing Images" by Ge et al. 2019](https://arxiv.org/abs/1901.07973v1), which also contains 13 different categories of clothing. Those two datasets are amazing, but I was still looking for something with a more detailed classification and descriptive attributes. Luckily, I found exactly that on Kaggle in the [iMaterialist (Fashion) 2019 for Fine-Grained segmentation competition](https://www.kaggle.com/c/imaterialist-fashion-2019-FGVC6/overview). This dataset (an image from it is displayed as the title image of this post) contains 46 apparel objects (27 main apparel items and 19 apparel parts), and 92 related fine-grained attributes in ca. 50k clothing images. This sounds like just the right dataset to do the proof of concept for our fashion image tagging AI. So now that we got the data, its time to think about the model.\n\n### Finding a model for instance segmentation\n\nAs mentioned shortly in the introduction, we\'re dealing with the problem of instance segmentation, which we can consider a combination of object detection, classification, and semantic segmentation. That means we\'ll need a couple of things to be achieved by the model architecture. One the one hand, object detection usually involves finding suitable bounding boxes for objects and classifying what is inside of those boxes. We don\'t really need to output another image on the backside of the model, but rather find the right class id and regress the bounding box coordinates. We don\'t really need to care about if the object was in a certain pixel of the original image and, hence, can usually use some kind of fully connected layers as a predictor for the neural network. 
Some examples of this would be the [YOLO](https://arxiv.org/abs/1506.02640) model (on which I\'ve previously written a [post](https://pascal-bliem.com/blog/object%20detection%20yolo)) which splits the original image into a grid and makes predictions for each cell in only one pass through the network, or regional convolutional neural networks (R-CNNs), which perform classifications for a certain number of region proposals and which I\'ll explain in more detail later. On the other hand, semantic segmentation really wants to know which pixel in the original image belongs to a certain class of object or background; we basically need to find an image mask that represents the presence of each class in the original image. One choice for this task would be [fully convolutional networks](https://arxiv.org/abs/1411.4038) (FCNs), in which the final dense layers are replaced by convolutions that output a feature map, which can be upsampled to the original images size and act as a heat map for the presence of a certain class. Another, more complex choice could be [U-Net](https://arxiv.org/abs/1505.04597), a architecture that combines two CNN parts in an encoder-decoder structure to output segmentation maps.\n\nIf we combine these two tasks, we end up with instance segmentation: locate the object, classify it, and find its segment on the original image. The model I\'ll be using for this is called [Mask R-CNN](https://arxiv.org/abs/1703.06870). It basically starts from a R-CNN for object detection and adds another branch to the model that outputs the segmentation masks. But do understand this thoroughly, we should go way back to the question "What are R-CNNs?". [Regional convolutional neural networks](https://arxiv.org/abs/1311.2524) have a region proposal mechanism which identifies regions of interest (ROI) in the original image (more details will follow) and then sends this region through a convolutional neural network that acts mostly as a classifier. This is performed on every proposed region, so that means for maybe 2000 proposals, it needs to run 2000 times. Obviously, that will take a lot of time. An improved version, called [Fast R-CNN](https://arxiv.org/abs/1504.08083) was proposed, which passed the entire image through a CNN once and then projects the proposed regions of interest on the output feature map of the CNN, and pools/maps the regions into a fully connected predictor. You can see that visualized in the figure below.\n\n![The concept of the Fast R-CNN (from Girshick et al. 2015)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/mask-rcnn/fast_rcnn.png)\n\nThis is already a lot faster than the original R-CNN, but there is still a bottle neck that makes it slow and that is the region proposal mechanism. It is usually some kind of selective search in which some initial regions are proposed and then iteratively merged by a greedy algorithm into larger regions based on heuristics such as similarity in color, texture, or size, until the wanted amount of proposals is reached. This takes relatively long, cannot always be performed on GPU with the rest of the network, and basically prevents real-time application. Luckily, there\'s another improvement called [Faster R-CNN](https://arxiv.org/abs/1506.01497), which comes with its own region proposal network (RPN), a fully convolutional network that simultaneously predicts object bounds and objectness (how much does this seem like an object?) scores at each position. This RPN shares its convolutional layers with the object detection network. 
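\n\nTo get a feeling for what these proposals start from, here is a tiny sketch of how the anchor boxes at a single sliding-window position can be enumerated (3 scales x 3 aspect ratios, the values the Faster R-CNN authors used; this is only an illustration, not torchvision\'s actual anchor generator):\n\n```python\nimport itertools\n\n# 3 scales x 3 aspect ratios = k = 9 anchors per position\nscales = [128, 256, 512]\naspect_ratios = [0.5, 1.0, 2.0]\n\nanchors = []\nfor scale, ratio in itertools.product(scales, aspect_ratios):\n    # keep the anchor area at roughly scale^2 while changing its shape\n    width = scale * ratio ** 0.5\n    height = scale / ratio ** 0.5\n    anchors.append((round(width), round(height)))\n\n# 9 (width, height) pairs, each centered on the current window position\nprint(anchors)\n```\n\n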
This idea is visualized in the figure below.\n\nThe region proposal network (RPN) in Faster R-CNN (from Ren et al. 2016)\n\nDuring training, the objective alternates between fine-tuning for the region proposal task and then fine-tuning for object detection, while keeping the proposals fixed. This scheme produces a unified network with convolutional features that are shared between both tasks. Region proposals are generated by sliding a smaller network over the convolutional feature maps, and at each position of the window, up to k anchor boxes are proposed, which have different scales and aspect ratios (the authors chose k=9 with 3 scales and 3 aspect ratios). These k anchors are then send to two fully connected layers, a bounding box regressor, and a classifier, which are then fine tuned (remember that the conv layers are shared between the region proposal network and the Fast R-CNN detector). You can see this anchor region proposal scheme visualized below.\n\n![The region proposal mechanism with anchor boxes in Faster R-CNN (from Ren et al. 2016)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/mask-rcnn/anchors_faster_rcnn.png)\n\nThis now means that the entire process of region proposal, classification, and bounding box regression can be done in one neural network and it is fast enough for real-time applications. Now we\'re almost there. We got the object detection part covered, now comes the segmentation part. Another development on top of Faster R-CNN ist [Mask R-CNN](https://arxiv.org/abs/1703.06870). It looks almost exactly like Faster R-CNN except one obvious and one less obvious difference. In addition to the class and bounding box prediction, it has an additional, fully decoupled head that adds another convolutional layer to predict a segmentation mask. When predicting masks, it is important to preserve the exact pixel locations. The Faster R-CNN\'s ROI pooling, however, is a coarse spatial quantization for feature extraction, which had to be replaced by a ROI alignment layer that preserves the exact pixel locations. You can see the architecture below. Now, I think, we got all the theoretical knowledge we need and can implement it.\n\n![The Mask R-CNN architecture (from He et al. 2018)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/mask-rcnn/mask_rcnn.png)\n\n### Setting up a Mask R-CNN\n\nThis architecture is already pretty advanced and there are a lot of elements that have to be plugged together: the backbone CNN, the [feature pyramid network](https://arxiv.org/abs/1612.03144), region proposal network, ROI alignment, the predictor heads for classification, bounding box regression, and mask prediction, as well as the respective losses. Coding this from scratch would be a lot of code. But luckily, [torchvision](https://github.com/pytorch/vision), PyTorch\'s library for computer vision, has a bunch of models already pre-implemented. The have a sub-module `torchvision.models.detection`, which hosts a variety of different R-CNN models, including Mask R-CNN. 
It also comes as a pretrained version with a [ResNet50](https://arxiv.org/abs/1512.03385) backbone that has been trained on the [COCO](https://cocodataset.org/#home) dataset.\n\nOn PyTorch\'s website, we can also find a really useful [tutorial](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html) that shows how we can use their Mask R-CNN and set up some new predictor heads for a custom dataset:\n\n```python\n# as in the PyTorch tutorial https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html\nimport torch\nimport torchvision\nfrom torchvision.models.detection.faster_rcnn import FastRCNNPredictor\nfrom torchvision.models.detection.mask_rcnn import MaskRCNNPredictor\n\n\ndef get_instance_segmentation_model(num_classes):\n # load an instance segmentation model pre-trained on COCO\n model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)\n\n # get number of input features for the classifier\n in_features = model.roi_heads.box_predictor.cls_score.in_features\n # replace the pre-trained head with a new one\n model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)\n\n # now get the number of input features for the mask classifier\n in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels\n hidden_layer_channels = 256\n # and replace the mask predictor with a new one\n model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,\n hidden_layer_channels,\n num_classes)\n\n return model\n```\n\nThis implementation expects a certain input. As targets, we need to pass a class label, a tensor with bounding box coordinates, as well as a tensor representing the masks. Therefore, we first need to set up a custom PyTorch dataset to get our data into the right format. I got the implementation below mostly from [this Kaggle Kernel](https://www.kaggle.com/abhishek/mask-rcnn-using-torchvision-0-17) by Abhishek Thakur and made some adjustments to incorporate the descriptive attributes in the data as well. Some of the modules I import here are utilities that can be found in the torchvision Github repo at [`vision/references/detection/`](https://github.com/pytorch/vision/tree/master/references/detection). 
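\n\nBefore feeding in the real data, it doesn\'t hurt to sanity-check the freshly swapped heads with a dummy forward pass (just a minimal sketch; the 47 classes assume our 46 apparel categories plus background):\n\n```python\nimport torch\n\n# 46 apparel categories + 1 background class\nmodel = get_instance_segmentation_model(num_classes=47)\nmodel.eval()\n\n# torchvision\'s detection models take a list of 3xHxW tensors\nwith torch.no_grad():\n    predictions = model([torch.rand(3, 512, 512)])\n\n# in eval mode we get one dict per image containing boxes, labels, scores, and masks\nprint(predictions[0].keys())\n```\n\n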
First of all, we need to convert the mask information, which is present in run-length encoding, into an array/tensor that represents a binary image mask.\n\n```python\nimport numpy as np\n\ndef rle_decode(mask_rle, shape):\n """Returns binary numpy array according to the shape,\n 1 for the mask, 0 for the background.\n\n Args:\n mask_rle: in 1d array of run-length encoding as string\n [start0] [length0] [start1] [length1]...\n shape: Shape of array to return (height,width)\n\n Returns:\n mask: The image mask as a numpy array of shape (height, width)\n\n """\n shape = (shape[1], shape[0])\n s = mask_rle.split()\n # gets starts & lengths 1d arrays\n starts, lengths = [\n np.asarray(x, dtype=int) for x in (s[0::2], s[1::2])\n ]\n starts -= 1\n # gets ends 1d array\n ends = starts + lengths\n # creates blank mask image 1d array\n img = np.zeros(shape[0] * shape[1], dtype=np.uint8)\n # sets mask pixels\n for lo, hi in zip(starts, ends):\n img[lo:hi] = 1\n # reshape as a 2d mask image, the transpose\n # is needed to align to RLE direction\n return img.reshape(shape).T\n```\n\nWe can then implement the custom dataset:\n\n```python\nimport collections\nimport os\n\nimport pandas as pd\nimport torch\nfrom PIL import Image\nfrom tqdm import tqdm\n\n# the implementation of the dataset is similar to Abhishek Thakur\'s kernel https://www.kaggle.com/abhishek/mask-rcnn-using-torchvision-0-17\nclass FashionDataset(torch.utils.data.Dataset):\n def __init__(self, image_dir, df_path, height, width, num_attributes, transforms=None):\n self.transforms = transforms\n self.image_dir = image_dir\n self.df = pd.read_csv(df_path)\n self.height = height\n self.width = width\n self.image_info = collections.defaultdict(dict)\n # ClassId contains categories as well as attributes,\n # we extract category here\n self.df[\'CategoryId\'] = self.df.ClassId.apply(\n lambda x: str(x).split("_")[0]\n )\n # add the descriptive attributes as well\n self.df[\'AttributesIds\'] = (\n self.df[\'AttributesIds\']\n .apply(lambda x: str(x).split(","))\n .apply(lambda idx: [\n 1 if (str(i) in idx) else 0\n for i in range(num_attributes)\n ])\n )\n\n # for each image, put all encodings and corresponding categories in lists\n temp_df = (self.df.groupby(\'ImageId\')[\'EncodedPixels\', \'CategoryId\']\n .agg(lambda x: list(x)).reset_index())\n # the image dimensions\n size_df = (\n self.df.groupby(\'ImageId\')[\'Height\', \'Width\']\n .mean()\n .reset_index()\n )\n temp_df = temp_df.merge(size_df, on=\'ImageId\', how=\'left\')\n\n # store all the relevant infos for each image in the image_info dict\n for index, row in tqdm(temp_df.iterrows(), total=len(temp_df)):\n image_id = row[\'ImageId\']\n image_path = os.path.join(self.image_dir, f"{image_id}.jpg")\n self.image_info[index]["image_id"] = image_id\n self.image_info[index]["image_path"] = image_path\n self.image_info[index]["width"] = self.width\n self.image_info[index]["height"] = self.height\n self.image_info[index]["labels"] = row["CategoryId"]\n self.image_info[index][\'attributes\'] = row[\'AttributesIds\']\n self.image_info[index]["orig_height"] = row["Height"]\n self.image_info[index]["orig_width"] = row["Width"]\n self.image_info[index]["annotations"] = row["EncodedPixels"]\n\n def __getitem__(self, idx):\n # load images and masks\n img_path = self.image_info[idx]["image_path"]\n img = Image.open(img_path).convert("RGB")\n img = img.resize((self.width, self.height), resample=Image.BILINEAR)\n\n info = self.image_info[idx]\n # create a mask for all objects in the image of shape\n # (num_obj, width, height)\n mask = 
np.zeros(\n (len(info[\'annotations\']), self.width, self.height),\n dtype=np.uint8\n )\n\n labels = []\n attributes = []\n # create the submasks for each object by decoding them from run_length\n # format to array of shape (orig_width, orig_height) and then resize\n # to (width, height)\n for m, (annotation, label, attribute) in enumerate(zip(\n info[\'annotations\'], info[\'labels\'], info[\'attributes\']\n )):\n sub_mask = rle_decode(\n annotation,\n (info[\'orig_height\'], info[\'orig_width\'])\n )\n sub_mask = Image.fromarray(sub_mask)\n sub_mask = sub_mask.resize(\n (self.width, self.height),\n resample=Image.BILINEAR\n )\n mask[m, :, :] = sub_mask\n # here we +1 the category label because the label numbering\n # starts at 0 but we want to consider 0 to be the background\n labels.append(int(label) + 1)\n attributes.append(attribute)\n\n # create bounding boxes for the objects and filter out objects\n # that are very small (below 20*20 pixels)\n num_objs = len(labels)\n boxes = []\n new_labels = []\n new_attributes = []\n new_masks = []\n\n for i in range(num_objs):\n try:\n pos = np.where(mask[i, :, :])\n xmin = np.min(pos[1])\n xmax = np.max(pos[1])\n ymin = np.min(pos[0])\n ymax = np.max(pos[0])\n if abs(xmax - xmin) >= 20 and abs(ymax - ymin) >= 20:\n boxes.append([xmin, ymin, xmax, ymax])\n new_labels.append(labels[i])\n new_attributes.append(attributes[i])\n new_masks.append(mask[i, :, :])\n except ValueError:\n continue\n\n # if there are no labels left, put in a background dummy\n if len(new_labels) == 0:\n boxes.append([0, 0, 20, 20])\n new_labels.append(0)\n new_attributes.append(np.zeros(num_attributes))\n new_masks.append(mask[0, :, :])\n\n # recombine the new masks into one array\n nmx = np.zeros(\n (len(new_masks), self.width, self.height),\n dtype=np.uint8\n )\n for i, n in enumerate(new_masks):\n nmx[i, :, :] = n\n\n # convert bounding boxes, masks, labels and idx to torch tensor\n boxes = torch.as_tensor(boxes, dtype=torch.float32)\n labels = torch.as_tensor(new_labels, dtype=torch.int64)\n masks = torch.as_tensor(nmx, dtype=torch.uint8)\n attributes = torch.as_tensor(new_attributes, dtype=torch.int64)\n image_id = torch.tensor([idx])\n # calculate area of the bounding boxes\n area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])\n # in the example from the PyTorch tutorial, people are segmented and\n # there is a flag for crowds of people - this is irrelevant here,\n # so we\'ll set it to zero\n iscrowd = torch.zeros((len(new_labels),), dtype=torch.int64)\n\n target = {}\n target["boxes"] = boxes\n target["labels"] = labels\n target[\'attributes\'] = attributes\n target["masks"] = masks\n target["image_id"] = image_id\n target["area"] = area\n target["iscrowd"] = iscrowd\n\n if self.transforms is not None:\n img, target = self.transforms(img, target)\n\n return img, target\n\n def __len__(self):\n return len(self.image_info)\n```\n\nFor data augmentation purposes, we might want to include some image transforms in the dataset. 
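\n\nBefore adding those transforms, a quick sanity check of the dataset class can\'t hurt (a minimal sketch; the paths are placeholders that depend on where the Kaggle data was extracted, and the attribute count matches the one we\'ll use below):\n\n```python\ndataset = FashionDataset(\n    image_dir="path/to/train",\n    df_path="path/to/train.csv",\n    height=512,\n    width=512,\n    num_attributes=341,\n)\n\nimg, target = dataset[0]\n# boxes should be of shape (num_obj, 4), labels of shape (num_obj,),\n# and masks of shape (num_obj, 512, 512)\nprint(target["boxes"].shape, target["labels"].shape, target["masks"].shape)\n```\n\nWith that out of the way, back to the augmentation transforms.\n\n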
Since the normal trochvision transform only operate on images, but we also need to transform the masks and bounding boxes accordingly, we will use custom transforms here which can be found under [`vision/references/detection/transforms.py`](https://github.com/pytorch/vision/tree/master/references/detection/transforms.py).\n\n```python\nimport transforms as T\n\ndef get_transform(train):\n transforms = []\n # converts the image, a PIL image, into a PyTorch Tensor\n transforms.append(T.ToTensor())\n if train:\n # during training, randomly flip the training images\n # and ground-truth for data augmentation\n transforms.append(T.RandomHorizontalFlip(0.5))\n return T.Compose(transforms)\n```\n\nNow we can set up a dataset and data loader for training.\n\n```python\nDATA_DIR = "path/to/data"\n# +1 because we consider the background to be class 0\nnum_classes = 46 + 1\nnum_attributes = 341\nbatch_size = 4\n\ndataset = FashionDataset(\n image_dir=os.path.join(DATA_DIR, "train"),\n df_path=os.path.join(DATA_DIR, "train.csv"),\n height=512,\n width=512,\n num_attributes=num_attributes,\n transforms=get_transform(train=True)\n)\n\ndata_loader = torch.utils.data.DataLoader(\n dataset,\n batch_size=batch_size,\n shuffle=True,\n num_workers=2,\n collate_fn=lambda x: tuple(zip(*x))\n)\n```\n\nThe rest will be a fairly standard PyTorch training loop, which is relatively simple because all losses are calculated within the model and returned as a dictionary during training.\n\n```python\n# get the model\nmodel = get_instance_segmentation_model(num_classes)\nmodel.to(device)\n\n# set up optimizer and learning rate scheduler\nparams = [p for p in model.parameters() if p.requires_grad]\noptimizer = torch.optim.SGD(\n params,\n lr=0.001,\n momentum=0.9,\n weight_decay=0.0005\n)\n\nlr_scheduler = torch.optim.lr_scheduler.StepLR(\n optimizer,\n step_size=10,\n gamma=0.1\n)\n\nmodel.train()\nnum_epochs = 10\n\nfor epoch in range(1, num_epochs+1):\n\n for i, (images, targets) in enumerate(data_loader):\n # move tensors to GPU\n images = [image.to(device) for image in images]\n targets = [{k: v.to(device) for k, v in t.items()} for t in targets]\n\n # in training, the model returns a dict with all individual losses for\n # the classifier, bounding boxes, masks, region proposals and objectness\n loss_dict = model(images, targets)\n\n # sum the losses to one value\n losses = sum(loss for loss in loss_dict.values())\n\n # backprop and optimize\n optimizer.zero_grad()\n losses.backward()\n optimizer.step()\n\n # step the learning rate scheduler\n lr_scheduler.step()\n\n print(\n f"Epoch {epoch}/{num_epochs} Batch {i}/{len(dataset)//batch_size}" f", Loss: {losses.item()}"\n )\n```\n\nIf we put the model into evaluation mode with `model.eval()`, it will not anymore output the loss dictionary, but the predictions for class, bounding boxes, and masks. Perfect segmentations would look something like in the image below.\n\n![Segmentation examples form the iMaterialist Fine-Grained segmentation dataset.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/mask-rcnn/segmentation.jpg)\n\nUp to this point it was fairly easy because PyTorch provided these useful implementations out of the box. But as I mentioned earlier. I\'d also like to add the prediction of descriptive attributes to the model which is not implemented yet. I\'ll probably rewrite some of the torchvision detection model code to include another predictor head or modify the existing Fast R-CNN predictor to give me another output. 
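\n\nJust to make that idea a bit more concrete (this is not an existing torchvision API, only a rough sketch of one possible direction with made-up names): such an attribute head could be a small multi-label classifier on top of the pooled box features, trained with a binary cross-entropy loss against the 0/1 attribute vectors we already prepare in the dataset.\n\n```python\nimport torch.nn as nn\n\n# hypothetical multi-label attribute head; how exactly it gets wired into\n# torchvision\'s roi_heads (and its loss) would still have to be worked out\nclass AttributePredictor(nn.Module):\n    def __init__(self, in_features, num_attributes):\n        super().__init__()\n        self.attribute_logits = nn.Linear(in_features, num_attributes)\n\n    def forward(self, box_features):\n        # one independent logit per attribute, to be trained\n        # with nn.BCEWithLogitsLoss\n        return self.attribute_logits(box_features)\n```\n\n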
This may take a while though, as I\'m fairly busy these days. Look out for it in a future post!\nThanks a lot for reading, I hope you had as much fun learning about instance segmentation and Mask R-CNNs as I had. Cheers!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Object Detection: YOLO",'Understand and implement the "You only look once" model',new Date("2021-10-11"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/yolocover.png","You only look once, so you better make the most of it!",["Data Science & AI/ML","Learning"],"As I already mentioned in a [previous post](https://pascal-bliem.com/blog/object%20detection%20metrics), after creating the [Doggo Snap](https://pascal-bliem.com/doggo-snap) app, I've become a lot more interested in computer vision tasks that can be solved by deep learning. One obvious example is object detection, where we want to detect certain objects in an image and figure out where exactly they are in the image. Imagine you're a robot with a camera as eyes and you need to figure out what to pick up and where it stands. That's where deep learning with convolutional neural network comes in very handy. In the [previous post](https://pascal-bliem.com/blog/object%20detection%20metrics), I've cover the concept of bounding boxes in object detection in detail, now we'll have a look at how we can build a model that predicts these bounding boxes around objects.\n\nObject detection used to be performed by a sliding window approach, in which a predefined box is slid over the image with a certain stride and every crop defined by the current position of the box is individually classified. This approach is, however, very computationally expensive because we have to \"look\" many time, for each new crop. [Region-proposing neural networks](https://arxiv.org/abs/1506.01497) were a bit faster but still slow. In the paper [\"You Only Look Once: Unified, Real-Time Object Detection\"](https://arxiv.org/abs/1506.02640), or short YOLO, the authors came up with a much more efficient way to predict bounding boxes. Since the original paper was published in 2016, there have been several updates to the YOLO architecture (I think there are 5 versions of it as of now), but I want to stick to the original version here to understand the fundamentals. In the following, I want to go through the idea behind YOLO, the implementation of the architecture, its loss function, and how it would be trained. But first, I want to say thanks a lot to [Aladdin Persson](https://www.youtube.com/channel/UCkzW5JSFwvKRjXABI-UTAkQ), a YouTuber who publishes really insightful deep learning videos, from which I learned a lot about deep learning, especially for computer vision and this implementation of YOLO. I'll assumer we'll be using the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) dataset here, which was also used in the original paper.\n\n### So how does that thing work?\n\nI think the authors describe what they did quite clearly in the [original paper](https://arxiv.org/abs/1506.02640), so go ahead and have a look at it if you're somewhat familiar with reading deep learning literature. Or just keep reading here, I'll try to summarize the basic idea behind the YOLO algorithm, and I'll try to use the same nomenclature as in the paper. So, we want to detect objects in images, which means we need to find the objects (predict bounding boxes) and figure out what kind of objects they are (perform a classification). 
What makes the YOLO algorithms particularly efficient is that, instead of performing several runs through a neural net for different parts of the image, we try to put out everything we need in one pass. This is done by splitting the image into a grid that has SxS cells (they used S=7, so 7x7 in the paper), and making each of these cells responsible for detecting only one object and outputting the corresponding bounding box. That means that if we needed to detect more small objects, we would need a finer grid (increase S). You can see this idea visualized in the figure below.\n\n![The concept behind the YOLO model (figure from the paper by Redmon et al.)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/yoloconcept.png)\n\nOf course, an object might be in several of these grid cells, so we need to find the one cell that contains the center point of the object, which will be responsible for outputting the bounding box for that object (the bounding box itself can reach beyond the cell's boundaries). Imagine that each grid cell gets its own coordinate system and the predicted coordinates and dimensions are relative to it. So, if we output something like [x, y, width, height], the coordinates (x, y) of the object's center point will be within the cell in relative coordinates (which means between 0 and 1), but the relative width and height may be larger than the cell (which means they could be larger than 1). Even though only one object can be detected in each cell, several bounding boxes could be predicted for that object. In the paper, the number of predicted bounding boxes is called B and set to equal 2. The idea behind that is that the different boxes could specialize in different characteristics, e.g. wide vs. tall objects. The output of one of the grid cells would then look something like [class_1, ..., class_c, confidence_1, x_1, y_1, width_1, height_1, confidence_2, x_2, y_2, width_2, height_2], if we have C classes and the class label is one-hot encoded. One of the class labels will be one, all others zero, depending on which class the object belongs to. A certainty as well as the center point coordinates, width, and height are outputted for each of the two predicted bounding boxes. Then the shape of the whole model output would be (S, S, C + 5 * B). Okay, that's the output, but what does the rest of the model look like?\n\n### The architecture\n\nThe model is a fairly standard and large convolutional neural network (CNN), inspired by [GoogLeNet](https://arxiv.org/abs/1409.4842) as the authors say, with 24 convolutional layers followed by 2 fully connected layers. The architecture is shown in the figure below.\n\n![The YOLO model architecture (figure from the paper by Redmon et al.)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/yoloarchitecture.png)\n\nThe network's convolutional part consists of several blocks of convolutional and max-pooling layers. The dimensions of the filter kernels, input and output channels, and strides can be read from the figure. The figure doesn't show padding, though. To satisfy the relationship between input and output dimensions of the convolution blocks, some padding is necessary. We can calculate the dimensions according to the formula `output_size = [(input_size - kernel_width + 2 * padding) / stride] + 1`. There needs to be a padding of 3 on the first input layer and same-padding for the rest of the convolutional blocks. 
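\n\nAs a quick check of that formula on the first layer: the input images are 448x448, the kernel is 7x7 with a stride of 2, so with a padding of 3 we get `[(448 - 7 + 2 * 3) / 2] + 1 = 223 + 1 = 224` (the brackets denote flooring the division), which matches the 224x224 feature map shown in the figure.\n\n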
Note that the final output now coincides with the shape of the predictions we've shown above; with S=7, B=2, and C=20, that's (7, 7, 30).\n\nLet's see how to implement this in code, using [PyTorch](https://www.pytorch.org), my favorite Python deep learning framework. We'll slightly deviate from the original implementation and use batch norm here, which was not used in the original paper. Batch normalization can usually speed up training massively by preventing internal covariate shift and making the optimization function space simpler. We will first define the architecture. Below is a list containing further lists, each of which represents a layer in the convolutional stack. The entries in these lists stand for [kernel_size, out_channels, stride, padding] of each layer. Max-pooling layers, which will always have a kernel size of 2x2 and a stride of 2, are represented by an \"M\". Not in this list, but implemented later, are also the two fully connected layers.\n\n```python\nyolo_architecture = [\n [7, 64, 2, 3],\n \"M\",\n [3, 192, 1, 1],\n \"M\",\n [1, 128, 1, 0],\n [3, 256, 1, 1],\n [1, 256, 1, 0],\n [3, 512, 1, 1],\n \"M\",\n [1, 256, 1, 0],\n [3, 512, 1, 1],\n [1, 256, 1, 0],\n [3, 512, 1, 1],\n [1, 256, 1, 0],\n [3, 512, 1, 1],\n [1, 256, 1, 0],\n [3, 512, 1, 1],\n [1, 512, 1, 0],\n [3, 1024, 1, 1],\n \"M\",\n [1, 512, 1, 0],\n [3, 1024, 1, 1],\n [1, 512, 1, 0],\n [3, 1024, 1, 1],\n [3, 1024, 1, 1],\n [3, 1024, 2, 1],\n [3, 1024, 1, 1],\n [3, 1024, 1, 1],\n]\n# fully connected layers are not considered here yet, they'll be implemented individually\n```\n\nNow let's build a network from this architecture. I'll try to keep telling the story with in-line comments in the code.\n\n```python\nfrom typing import List, Union, Tuple\nimport torch\nimport torch.nn as nn\n\n# a class that represents one of the CNN blocks in the network\nclass CNNBlock(nn.Module):\n def __init__(self, in_channels: int, out_channels: int, **kwargs) -> None:\n super(CNNBlock, self).__init__()\n\n # the convolution itself\n self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)\n # followed by batch normalization\n self.batchnorm = nn.BatchNorm2d(out_channels)\n # and the activation function\n self.leakyrelu = nn.LeakyReLU(0.1)\n\n def forward(self, x: torch.Tensor) -> torch.Tensor:\n x = self.conv(x)\n x = self.batchnorm(x)\n return self.leakyrelu(x)\n\n# this is the actual model itself\nclass Yolo(nn.Module):\n # all parameters have defaults according to the paper\n # in_channels are the color channels of RGB images\n # grid_size, num_boxes, num_classes are S, B, C\n def __init__(\n self,\n architecture: List[Union[List[int], str]] = yolo_architecture,\n in_channels: int = 3,\n grid_size: int = 7,\n num_boxes: int = 2,\n num_classes: int = 20,\n **kwargs\n ) -> None:\n super(Yolo, self).__init__()\n\n self.architecture = architecture\n self.in_channels = in_channels\n # we'll have separate functions for creating the layers\n self.conv_layers = self._create_conv_layers(self.architecture)\n self.fc_layers = self._create_fully_connected_layers(\n grid_size, num_boxes, num_classes\n )\n\n def forward(self, x: torch.Tensor) -> torch.Tensor:\n x = self.conv_layers(x)\n # start_dim=1 because we don't want to flatten the batch size\n x = torch.flatten(x, start_dim=1)\n return self.fc_layers(x)\n\n # create the convolutional part of the network\n def _create_conv_layers(\n self,\n architecture: List[Union[List[int], str]]\n ) -> nn.Sequential:\n layers = []\n\n in_channels = self.in_channels\n\n # we loop through 
the architecture list and create each layer\n for layer in architecture:\n # if layer is a list, we know it's a conv layer\n if type(layer) == list:\n # add a CNNBlock\n layers += [\n CNNBlock(\n in_channels=in_channels,\n out_channels=layer[1],\n kernel_size=layer[0],\n stride=layer[2],\n padding=layer[3])\n ]\n # set in_channels for the next layer\n # to out_channels of current layer\n in_channels = layer[1]\n # if it's a max-pooling layer\n elif type(layer) == str:\n layers += [nn.MaxPool2d(kernel_size=2, stride=2)]\n\n # return all layers as a sequential model part\n return nn.Sequential(*layers)\n\n # create the fully connected part of the network\n def _create_fully_connected_layers(\n self,\n grid_size: int,\n num_boxes: int,\n num_classes: int\n ) -> nn.Sequential:\n S, B, C = grid_size, num_boxes, num_classes\n return nn.Sequential(\n nn.Flatten(),\n nn.Linear(1024 * S * S, 4096),\n nn.Dropout(0.5),\n nn.LeakyReLU(0.1),\n # this will be reshaped later to be of shape (S, S, C + B * 5)\n nn.Linear(4096, S * S * (C + B * 5))\n )\n```\n\nThat's basically the model architecture. Not actually that complicated, right? The interesting part comes now, where we'll see which loss function is responsible for training the model.\n\n### The loss function\n\nLet's think again about what exactly the model is doing. For each cell in the SxS grid, we want to see if there's an object or not and if yes, classify to which class it belongs. We also want to find the object's center point and draw a bounding box around it. All these parts of the problem are cast into the loss function, which can be seen in the figure below. It looks complicated at first, but it makes a lot of sense when we go through it bit by bit.\n\n![The YOLO loss function (figure from the paper by Redmon et al.)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/yololoss.png)\n\nThe overall loss function is composed of several contributing parts of square losses. The squaring of the loss terms has the effect that large deviations from the ground truth will be penalized much more. Some of the loss terms are prefixed by `lambda_coord`, which is just a multiplication constant to prioritize these loss terms a bit higher, because we want the model to put particular emphasis on getting the location of the boxes right. We'll set it to 5. The first term is for the center point coordinates; we can see we sum over the number of grid cells (SxS) and the number of predicted boxes per cell (B=2 in our case). The identity function in front of the loss term is either 1, if there was a target bounding box in the i-th cell and the j-th predicted box was responsible for outputting that box (meaning it had the highest IOU out of all predictors in that grid cell), else it is 0. The second term is basically the same, but for the width and height of the bounding box. Note that we take the square roots of height and width to make sure that we prioritize smaller bounding boxes as much as larger ones. In the third term, `C_i` is either 1 or 0, depending on if there is an object in the cell or not, and `C^_i` is the predicted probability that there is an object in the cell. The fourth term is basically the same as the third, but for the case that there is no object in the cell. We want to penalize a prediction for an object if there actually is none. There's again a multiplication constant to prioritize this term a bit less; in this case we'll set it to 0.5. 
The last term is for the classification, so whether we get the class of the object right. Interestingly, instead of a common cross-entropy loss, the authors use a simple regression loss here as well.\n\nNow let's have a look at how we can implement this custom loss function in code. We'll also use the `intersection_over_union()` function from the [previous post](https://pascal-bliem.com/blog/object%20detection%20metrics).\n\n```python\nclass YoloLoss(nn.Module):\n def __init__(\n self,\n grid_size: int = 7,\n num_boxes: int = 2,\n num_classes: int = 20\n ) -> None:\n # Note that I'm pretending that these parameters could be varied\n # from their defaults, but actually I'm treating them as if\n # they're hard-coded constants here to make the implementation\n # simpler and make the concepts clearer to understand.\n super(YoloLoss, self).__init__()\n\n # we use summed square losses\n self.mse = nn.MSELoss(reduction=\"sum\")\n self.S = grid_size\n self.B = num_boxes\n self.C = num_classes\n # the prioritization multipliers as described above\n self.lambda_noobj = 0.5\n self.lambda_coord = 5\n\n def forward(\n self,\n predictions: torch.Tensor,\n target: torch.Tensor\n ) -> torch.Tensor:\n # reshape the predictions to (batch_size, S, S, C + B * 5)\n predictions = predictions.reshape(\n -1, self.S, self.S, self.C + self.B * 5\n )\n\n # calculate the IOU of the two predicted bboxes with the target bbox\n iou_b1 = intersection_over_union(\n predictions[..., 21:25], target[..., 21:25]\n )\n iou_b2 = intersection_over_union(\n predictions[..., 26:30], target[..., 21:25]\n )\n ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)\n\n # best_box is the argmax and will either be 0 or 1 if B=2,\n # depending which of the two predicted bboxes has the higher IOU\n iou_maxes, best_box = torch.max(ious, dim=0)\n # in the paper, this is the identity function I_obj_i\n # that tells us if there is an object in cell i\n exists_box = target[..., 20].unsqueeze(3)\n\n ### Loss for box coordinates ###\n # select predicted coordinates, width, and height for the best_box\n box_predictions = exists_box * (\n # this is 0 if the 0th bbox was best\n best_box * predictions[..., 26:30]\n # this is 0 if the 1st bbox was best\n + (1 - best_box) * predictions[..., 21:25]\n )\n # same for target box\n box_targets = exists_box * target[..., 21:25]\n\n # we take the sqrt of width and height\n box_predictions[..., 2:4] = (\n torch.sign(box_predictions[..., 2:4])\n * torch.sqrt(torch.abs(box_predictions[..., 2:4] + 1e-6))\n )\n box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])\n\n # calculate the summed squared loss\n box_loss = self.mse(\n # we flatten this here from (batch_size, S, S, 4) to\n # (batch_size*S*S, 4) because the MSE will sum up the\n # losses of all batch_size examples and all S*S cells\n torch.flatten(box_predictions, end_dim=-2),\n torch.flatten(box_targets, end_dim=-2)\n )\n\n ### Loss for object ###\n # get prediction probability/confidence of best_box\n pred_box = (\n best_box * predictions[..., 25:26]\n + (1 - best_box) * predictions[..., 20:21]\n )\n\n object_loss = self.mse(\n # same as above, we flatten to (batch_size*S*S*1)\n torch.flatten(exists_box * pred_box),\n torch.flatten(exists_box * target[..., 20:21])\n )\n\n ### Loss for no object ###\n\n # if there is no object, both predicted boxes should know that\n # there is no object, hence, we consider loss from both boxes here\n no_object_loss = self.mse(\n torch.flatten((1 - exists_box) * predictions[..., 20:21]),\n torch.flatten((1 - 
exists_box) * target[..., 20:21])\n )\n\n no_object_loss += self.mse(\n torch.flatten((1 - exists_box) * predictions[..., 25:26]),\n torch.flatten((1 - exists_box) * target[..., 20:21])\n )\n\n ### Loss for classification ###\n class_loss = self.mse(\n # flatten to (batch_size*S*S, 20)\n torch.flatten(exists_box * predictions[...,:20], end_dim=-2),\n torch.flatten(exists_box * target[...,:20], end_dim=-2)\n )\n\n ### Final Loss ###\n # combine all the loss terms\n loss = (\n self.lambda_coord * box_loss\n + object_loss\n + self.lambda_noobj * no_object_loss\n + class_loss\n )\n\n return loss\n```\n\nThat was pretty much the trickiest part. Now we can almost start training, we just have to get the data first.\n\n### Getting the training data\n\nAs I mentioned in the introduction, the original YOLO was trained on the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) dataset, which I also like because it has my name in it. This is a dataset with around 43000 images and labels that specify to which of the 20 classes the object belongs and where the bounding boxes should be located. You can have a look at some examples from the 20 classes (airplanes, people, plants, chairs etc.) in the image below. Getting the data from the original source is a bit of a hassle, but luckily, [Aladdin Persson](https://www.youtube.com/channel/UCkzW5JSFwvKRjXABI-UTAkQ) has uploaded a prepared version of the dataset on [Kaggle](https://www.kaggle.com/dataset/734b7bcb7ef13a045cbdd007a3c19874c2586ed0b02b4afc86126e89d00af8d2), from where you can download it.\n\n![Some samples from the PASCAL VOC dataset.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/pascalvocimages.png)\n\nThe dataset is distributed over two directories, one which holds the bare images, and one which has one text file for each image containing the labels for that image. There is also a CSV file, mapping the image files to their respective label files. For each object in an image, there is one line in the label file that has 5 columns, one for the class label, the center point coordinates (x, y), width, and height. The coordinates and dimensions are relative (between 0 and 1) to the whole image, which is convenient, because we'll be rescaling the images. We will need to convert the coordinates for the whole image to coordinates relative to the cells in the SxS grid, though. PyTorch allows us to define custom datasets, so we can implement all our special needs into its `__getitem__()` function. 
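\n\nTo make that conversion concrete with a quick example: with S=7, an object whose center lies at (x, y) = (0.5, 0.62) relative to the whole image falls into grid cell `i = int(7 * 0.62) = 4` (the row) and `j = int(7 * 0.5) = 3` (the column), and its cell-relative coordinates become `x_cell = 7 * 0.5 - 3 = 0.5` and `y_cell = 7 * 0.62 - 4 = 0.34`, while width and height are simply scaled up by S.\n\n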
Let's code it.\n\n```python\nimport os\nimport pandas as pd\nfrom PIL import Image\n\nclass Dataset(torch.utils.data.Dataset):\n # the mapping file is a csv that provides a\n # mapping from the image to its label file\n def __init__(\n self,\n mapping_path: str,\n img_path: str,\n label_path: str,\n S: int = 7,\n B: int = 2,\n C: int = 20,\n transform: nn.Module = None\n ) -> None:\n # read the image-label-mapping from csv\n self.mapping = pd.read_csv(mapping_path)\n self.img_path = img_path\n self.label_path = label_path\n self.S = S\n self.B = B\n self.C = C\n self.transform = transform\n\n def __len__(self):\n return len(self.mapping)\n\n # we only have to implement what to do for a single item\n # in the dataset and can then get it by index\n def __getitem__(self, index: int) -> Tuple[Image.Image, torch.Tensor]:\n # get the label path for item of index\n label_path = os.path.join(\n self.label_path, self.mapping.iloc[index, 1]\n )\n\n # get all objects and their bboxes for that item\n bboxes = []\n with open(label_path) as file:\n for line in file.readlines():\n class_label, x, y, width, height = [\n float(l) for l in line.replace(\"\n\", \"\").split()\n ]\n class_label = int(class_label)\n bboxes.append([class_label, x, y, width, height])\n\n # get the image path for item of index\n img_path = os.path.join(\n self.img_path, self.mapping.iloc[index, 0]\n )\n\n image = Image.open(img_path)\n # cast to tensor in case we need to do transformations\n bboxes = torch.tensor(bboxes)\n\n # if there are any transformations, apply them to both\n # the image and the bounding boxes\n if self.transform:\n image, bboxes = self.transform(image, bboxes)\n\n # The following corresponds to the output of the model,\n # the SxS grid times number of classes plus B-times prediction\n # certainty, midpoint coordinates (x, y), width and height.\n # From the B boxes, actually only one is used because we only\n # have one ground truth label here, but we need its shape to match\n # the predictions, where we output B=2 bounding box candidates.\n target_matrix = torch.zeros([self.S, self.S, self.C + 5 * self.B])\n for bbox in bboxes:\n class_label, x, y, width, height = bbox.tolist()\n class_label = int(class_label)\n # in the SxS grid, i is row index, j is column index\n # we cast to int to get the cell the center point is in\n i, j = int(self.S * y), int(self.S * x)\n # calculate (x,y) relative to the cell coordinate system\n x_cell, y_cell = self.S * x - j, self.S * y - i\n # get width and height relative to the cell coordinate system\n width_cell, height_cell = width * self.S, height * self.S\n\n # one cell can be responsible for only one object,\n # so if there is currently no object in cell (i,j)\n if target_matrix[i, j, 20] == 0:\n # then there is one now\n target_matrix[i, j, 20] = 1\n # set the bbox coordinates and class label\n bbox_coordinates = torch.tensor(\n [x_cell, y_cell, width_cell, height_cell]\n )\n target_matrix[i, j, 21:25] = bbox_coordinates\n target_matrix[i, j, class_label] = 1\n\n # return image and target which now has the (S, S, C + B * 5)\n # shape that we've discussed in the architecture section\n return image, target_matrix\n\n```\n\nNow that we've got the data ready, let's train the model.\n\n### Training\n\nWe have set up almost everything we need already. 
We'll set some hyperparameters, instantiate our model, loss function, an optimizer and data loader and run a fairly standard PyTorch training loop.\n\n```python\nimport torchvision.transforms as transforms\nimport torch.optim as optim\nfrom torch.utils.data import DataLoader\n# tqdm is a convenient package for a dynamic progress bar\nfrom tqdm import tqdm\n\ntorch.manual_seed(42)\n\n# hyperparameters\nlearning_rate = 0.0005\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\nbatch_size = 64\nweight_decay = 0\nepochs = 100\nnum_workers = 2\npin_memory = True\nimage_path = \"path/to/images\"\nlabel_path = \"path/to/labels\"\n\n# write a custom compose for transforms that not only transforms\n# the image but also the bounding boxes accordingly\nclass Compose(object):\n def __init__(self, transforms: List[nn.Module]) -> None:\n self.transforms = transforms\n\n def __call__(\n self,\n img: Union[torch.Tensor, Image.Image],\n bboxes: torch.Tensor\n ) -> Tuple[torch.Tensor, torch.Tensor]:\n for transform in self.transforms:\n img, bboxes = transform(img, bboxes)\n return img, bboxes\n\n# resize the image to match the input size of the model\ntransform = Compose([transforms.Resize([448, 448]), transforms.ToTensor()])\n\n# the training function\ndef train(\n train_loader: DataLoader,\n model: Yolo,\n optimizer: optim.Optimizer,\n loss_function: YoloLoss\n) -> None:\n model = model.to(device)\n # set up progress bar\n loop = tqdm(train_loader, leave=True)\n mean_loss = []\n\n # iterate over each batch, x and y are the\n # model inputs and targets, respectively\n for batch_idx, (x, y) in enumerate(loop):\n x, y = x.to(device), y.to(device)\n # make prediction\n out = model(x)\n # calculate loss\n loss = loss_function(out, y)\n mean_loss.append(loss.item())\n # step gradient\n optimizer.zero_grad()\n loss.backward()\n optimizer.step()\n # update progressbar\n loop.set_postfix(loss=loss.item())\n\n print(f\"Mean loss is {sum(mean_loss)/len(mean_loss)}\")\n\n# instantiate model, optimizer, and loss function\nmodel = Yolo(grid_size=7, num_boxes=2, num_classes=20).to(device)\noptimizer = optim.Adam(\n model.parameters(),\n lr=learning_rate,\n weight_decay=weight_decay\n)\nloss_function = YoloLoss()\n\n# prepare training dataset, passing the transform defined above\ntrain_dataset = Dataset(\"path/to/mapping\", image_path, label_path, transform=transform)\ntrain_loader = DataLoader(\n dataset=train_dataset,\n batch_size=batch_size,\n num_workers=num_workers,\n pin_memory=pin_memory,\n shuffle=True,\n drop_last=True\n)\n\n# train the model\nfor epoch in range(epochs):\n train(train_loader, model, optimizer, loss_function)\n\n```\n\nAnd that's the training. We can set up a data loader for the test set and perform model evaluation on it in almost the same way, except that we don't need to optimize anything then and should set the model to evaluation mode with `model.eval()` and disable gradient tracking by performing the calculation in a `with torch.no_grad():` block. Additionally, we could also take into account some learning rate scheduling, which may help with training. As you may have imagined, I'm not actually fully training the model here, because it is pretty huge and I don't want to spend the time and resources on it. The authors of the YOLO paper pretrained their CNN on [ImageNet](https://www.image-net.org/) for a week before they even started switching the training task to detection. Instead of wasting a lot of time and money here, if we had a real task at hand now, we could start from an already pretrained model. 
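\n\nFor example, a pretrained YOLOv5 can be pulled straight from the hub and used for inference in a couple of lines (a rough sketch based on the hub page linked below; the exact call may change between releases):\n\n```python\nimport torch\n\n# load a small pretrained YOLOv5 model from the PyTorch hub\nmodel = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)\n\n# run inference on an image file and print the detections\nresults = model('path/to/some_image.jpg')\nresults.print()\n```\n\n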
The YOLO architectures are pretty well know and you can download the latest version 5 in different sizes and with pretrained weights from the PyTorch [model hub](https://pytorch.org/hub/ultralytics_yolov5/). Anyways, the output would look something like what you can see in the image below :)\n\n![Some examples of object detection on artwork and natural images (figure from the paper by Redmon et al.)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/yolooutput.png)\n\n### Conclusion\n\nIn this post, we've explored the famous \"You only look once\" (YOLO) model for object detection with deep learning. We've discussed the general working principle behind YOLO, how we manage to detect multiple objects in one \"run\" through the network by splitting the images into cells in a grid and predicting bounding boxes for each of these cells. We have implemented the architecture of the model and understood the individual terms contributing to the special YOLO loss function. We've then set up a custom dataset implementation for the PASCAL VOC dataset as well as a simple training loop to train the model. Since the original YOLO, several improvements have been made to the algorithm, so if you want to use YOLO in a project, check out the latest version. I, again, want to thank [Aladdin Persson](https://www.youtube.com/channel/UCkzW5JSFwvKRjXABI-UTAkQ), whose videos on deep learning have helped me a lot to understand the whole topic of object detection. And, of course, thank you for reading!\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Object Detection: Metrics","A couple of things we need to know for evaluating object detection models",new Date("2021-08-16"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/boundingboxes.png","Object detection in action!",["Data Science & AI/ML","Learning"],"After my last big project, [Doggo Snap](https://pascal-bliem.com/doggo-snap), a mobile app which uses deep learning to classify dog breeds from photos, I've gotten really exited about the field of computer vision. There is still a lot to explore beyond simple image classification. After being able to tell if something, such as a certain dog breed, is in an image, it would also be nice to tell where in the image it is. This is the problem of object detection or image segmentation. We want to be able to not only classify one or more objects or a section in an image, but also to draw an accurate bounding box around them or detect if a pixel belongs to a certain class. I want to focus on object detection with bounding boxes for now, as can be seen in the title image above.\n\nTraditionally this used to be performed by a sliding window approach, in which a predefined box is slid over the image with a certain stride and every crop defined by the current position of the box is individually classified. This approach is, however, very computationally expensive because we have to \"look\" many time, for each new crop. [Region-proposing neural networks](https://arxiv.org/abs/1506.01497) were a bit faster but still slow. Would be nice if we only had to look once, right? This is also what the authors of the paper [\"You Only Look Once: Unified, Real-Time Object Detection\"](https://arxiv.org/abs/1506.02640), or short YOLO, thought when they came up with a much more efficient algorithm to perform object detection. I want to get into the YOLO neural network architecture in a later post though. 
For now, it is important to cover some metrics first, which are crucial for understanding how we can evaluate the bounding boxes that our model predicts. Before I continue here I want to say huge thanks to [Aladdin Persson](https://www.youtube.com/channel/UCkzW5JSFwvKRjXABI-UTAkQ), who has a YouTube channel on which he publishes fantastically educative videos on deep learning, from which I learned a lot about computer vision with deep neural networks.\n\nIn the following, we'll cover three metrics that kind of build up on each other and, eventually, will allow us to score the performance of the predictions we get, and on the way there, solve the problem that we may potentially get multiple bounding boxes for the same object. Those metrics are\n\n- Intersection over Union (IOU)\n- Non-max suppression\n- Mean Average Precision (mAP)\n\nLet's have a look at them one by one.\n\n### Intersection over Union\n\nGiven that we have labeled data to train our models, we'll need to compare the label/target classifications and bounding boxes with the predictions made by the model. The classification part is easy, either we predicted the right class or not, maybe taking the prediction certainty into account. How about the bounding boxes? We want the predictions to overlap as much as possible with the targets. So, we want the common area of the two boxes, their intersection, to be relatively large compared to the combined area of the target and prediction box, their union. The ration of the two is the intersection-over-union (IOU). You can see this in the example below. The green shaded area, which both the target bounding box and the predicted bounding box have in common, is the intersection. When calculating the union, we have to keep in mind to only count this area once in total, not once per bounding box. You can now see that if the boxes wouldn't overlap at all, the IOU would equal 0, if they would overlap perfectly, the IOU would equal 1. I guess somewhere around 0.5, the prediction would become passable.\n\n![A target and a predicted bounding box for detecting a beer glass. From this we can calculate the IOU](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/koelschbox.png)\n\nCalculating the union is trivial, but how do we calculate the intersection? We need to find the box coordinates that define it. To do so we get the top left edge by taking the maximum of each the x-coordinates and the y-coordinates of the target and predicted box. For the lower right point, we do the same but take the minimum instead. Let's see how we'd implement this in code. Throughout this (and the next) post, I'll use [PyTorch](https://pytorch.org/), my favorite tensor library and deep learning framework in Python. Note that the bounding box coordinate format, [x, y, w, h], I will use here is not describing the top left and bottom right edge of the box but rather the center point of the box and its width and height. This is the format also used in the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) dataset, which was used in the original YOLO paper (and carries my name :p). 
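\n\nA quick toy example to put numbers on this: take a target box and a predicted box that are both 2x2 in size, with the prediction shifted by 1 in both x and y. The intersection is then a 1x1 square with area 1, the union is 4 + 4 - 1 = 7, and the IOU is 1/7, or about 0.14 - clearly a poor localization. If the shift were only 0.2 in each direction, the intersection would be 1.8 * 1.8 = 3.24, the union 4.76, and the IOU about 0.68.\n\n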
I'll try to keep telling the story with inline-comments:\n\n```python\nfrom typing import List\nimport torch\n\ndef intersection_over_union(\n bboxes_pred: torch.Tensor,\n bboxes_target: torch.Tensor\n) -> torch.Tensor:\n \"\"\"Calculates the intersection-over-union (IOU)\n of target and predicted bounding boxes.\n\n Args:\n bboxes_pred: Tensor of shape (N, 4), containing N\n predicted bounding boxes, each [x, y, w, h]\n bboxes_target: Tensor of shape (N, 4), containing N\n target bounding boxes, each [x, y, w, h]\n\n Returns:\n iou: The intersection-over-union metric\n \"\"\"\n\n # convert the center-point-width-height representation of the\n # boxes to a top-left-edge-bottom-right-edge representation\n box1_x1 = bboxes_pred[..., 0:1] - bboxes_pred[..., 2:3] / 2\n box1_y1 = bboxes_pred[..., 1:2] - bboxes_pred[..., 3:4] / 2\n box1_x2 = bboxes_pred[..., 0:1] + bboxes_pred[..., 2:3] / 2\n box1_y2 = bboxes_pred[..., 1:2] + bboxes_pred[..., 3:4] / 2\n box2_x1 = bboxes_target[..., 0:1] - bboxes_target[..., 2:3] / 2\n box2_y1 = bboxes_target[..., 1:2] - bboxes_target[..., 3:4] / 2\n box2_x2 = bboxes_target[..., 0:1] + bboxes_target[..., 2:3] / 2\n box2_y2 = bboxes_target[..., 1:2] + bboxes_target[..., 3:4] / 2\n\n # get the top-left (x1, y1) and bottom-right (x2, y2)\n # points that define the intersection box\n x1 = torch.max(box1_x1, box2_x1)\n y1 = torch.max(box1_y1, box2_y1)\n x2 = torch.min(box1_x2, box2_x2)\n y2 = torch.min(box1_y2, box2_y2)\n\n # calculate the intersection area by multiplying the sides of the rectangle\n # (the clamp(0) is for the case that the boxes don't intersect at all)\n intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)\n\n # get the individual box areas\n box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))\n box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))\n\n # calculate the union (don't forget to not count the intersecting area double)\n union = box1_area + box2_area - intersection\n\n return intersection / union\n```\n\n### Non-max Suppression\n\nThe next concept we're going to talk about is non-max suppression. This isn't really a metric, but rather a technique to solve the following problem: What if we get many bounding box predictions that look like they're largely overlapping and meant for the same object? How are we going to choose the right one? We can see this example in the figure below. We will usually build our models in a way that they output a certainty score for a classification, a probability that states how likely our model thinks that the object actually belongs to the predicted class. The higher this score, the more certain the model is that what is inside of the box actually belongs to the predicted class. We can describe each prediction as an array with 6 elements: bbox = [class, certainty, x, y, width, height]. Knowing this, non-max suppression is actually pretty easy to understand: For all boxes predicted for a certain class, we check them pair-wise (starting from the one with the highest certainty) and calculate their IOU. If it is over a certain threshold (e.g. 0.5) we consider those boxes redundant and just remove the one with the lower predicted certainty. We then repeat this for all classes. 
We can also set a certainty/probability threshold to filter out unlikely boxes right from the start.\n\n![Non-max suppression: from bounding boxes that have a minimum overlap (IOU), we only take the one with the highest predicted certainty.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/koelschnonmax.png)\n\nNow let's see how we could implement this in code:\n\n```python\ndef non_max_suppression(\n bboxes: List[List[float]],\n iou_threshold: float,\n cert_threshold: float,\n) -> List[List[float]]:\n \"\"\"Perform non-max suppression on predicted bounding boxes.\n\n Args:\n bboxes: List of shape (N, 6), containing N predicted\n bounding boxes, each [class, certainty, x, y, w, h]\n iou_threshold: IOU threshold\n cert_threshold: Certainty score threshold\n\n Returns:\n remaining_bboxes: Bounding boxes remaining after\n non-max suppression is performed\n \"\"\"\n\n # bboxes will be of shape [[class, certainty, x, y, width, height], ...]\n # filter out all boxes below certainty threshold\n bboxes = [box for box in bboxes if box[1] > cert_threshold]\n\n # this will be the output after non-max suppression\n remaining_bboxes = []\n\n # put bbox with highest certainty to the beginning of the list\n bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)\n\n # take bboxes one by one starting from highest certainties\n while bboxes:\n chosen_box = bboxes.pop(0)\n\n # filter out lower-certainty boxes of the same class\n # that have sufficiently high IOU with the chosen box\n bboxes = [\n box for box in bboxes\n # keep them if they're of different classes\n if box[0] != chosen_box[0]\n # or if they're below the iou_threshold\n or intersection_over_union(\n torch.tensor(chosen_box[2:]),\n torch.tensor(box[2:]),\n ) < iou_threshold\n ]\n\n remaining_bboxes.append(chosen_box)\n\n return remaining_bboxes\n```\n\n### Mean Average Precision (mAP)\n\nNow this final metric is what we'll actually use for scoring the performance of our object detection model. The mean Average Precision (mAP) is basically the integration of the precision-recall-curve of a model. Anyone who has done anything with machine learning has probably encountered precision and recall, but let's quickly recap it here. Precision tells us, among all positive predictions we make, how many of those are true positives: precision = true positives / (true positives + false positives). The recall tells us, among all positive examples in the data, how many did the model predict to be positive: recall = true positives / (true positives + false negatives). The tradeoff between these two metrics is obvious. If we want a higher precision, we can try to make our model stricter so that it will make positive predictions with more care, only predicting the ones with high certainty. At the same time, that means that probably more data points will be missed by it and false negatives increase, hence, recall decreases. A generally good model should have a good precision and a good recall at the same time, or in other words, if precision and recall are plotted against each other on a curve, the area under that curve should be as large as possible. This area is also called the average precision (AP). If we calculate this AP for each class that the model can potentially classify and take the mean of those results, we end up with the mean average precision (mAP). 
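To make these formulas a bit more concrete, here is a tiny, made-up example (the numbers are purely illustrative and are not taken from any real model):\n\n```python\nimport torch\n\n# say our model made 10 positive predictions, 8 of which were correct,\n# and the dataset contains 12 actual positive examples\ntrue_positives = 8\nfalse_positives = 2 # 10 predictions - 8 correct ones\nfalse_negatives = 4 # 12 actual positives - 8 that were found\n\nprecision = true_positives / (true_positives + false_positives) # 0.8\nrecall = true_positives / (true_positives + false_negatives) # ~0.67\n\n# the average precision (AP) is the area under the precision-recall curve;\n# given some (again made-up) points on that curve, we can integrate it numerically\nrecalls = torch.tensor([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])\nprecisions = torch.tensor([1.0, 0.9, 0.85, 0.7, 0.5, 0.3])\naverage_precision = torch.trapz(precisions, recalls)\n```\n\n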
You can see this visualized in the image below; the precision-recall-curves for different classes are plotted and the overall model performance can be evaluated by the mean of the integrals of these curves.\n\n![Precision-Recall-Curves of different classes classified by model are used to calculate the mean Average Precision (mAP)](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/object-detection/mAP.png)\n\nBut how do we apply this to the problem of object detection? As mentioned before, we'll probably be in a situation in which we have labeled data with target bounding boxes given as ground truth and bounding boxes predicted by our model. We want to know how close to the targets the predictions are. As we know now, we need to calculate the precision-recall-curve per class, and for that we need to first know which predictions we consider true or false positives. For that we can simply calculate the IOU of the target and predicted box and see if it lies above a defined threshold. Now we calculate the curve by sorting the predictions by confidence score from the highest to the lowest, and then we go through that sequence and calculate precision and recall cumulatively, always up to the current step in the sequence. The resulting values are points on the precision-recall-curve and we can integrate the curve. We then repeat the procedure for all classes and take the mean. We may even run this entire algorithm for several different IOU thresholds.\n\nNow let's see how we would implement that in code. Note that now (different from the previous code cell), each prediction will be a list of seven elements because here we score on all images in a dataset and we want to make sure that the target and prediction boxes actually belong to the same image; therefore, we include an image index:\n\n```python\nfrom collections import Counter\n\ndef mean_average_precision(\n pred_boxes: List[List[float]],\n target_boxes: List[List[float]],\n iou_threshold: float = 0.5,\n num_classes: int = 20\n) -> float:\n \"\"\"Calculate mean Average Precision (mAP) score.\n\n Args:\n pred_boxes: List of shape (N, 7), containing N predicted\n bounding boxes, each [img_idx, class, certainty, x, y, w, h]\n target_boxes: List of shape (N, 7), containing N target\n bounding boxes, each [img_idx, class, certainty, x, y, w, h]\n iou_threshold: Threshold for IOU between target and predicted\n bounding box\n num_classes: The number of classes used by the model\n\n Returns:\n mAP: mean Average Precision score\n \"\"\"\n\n average_precisions = []\n\n # loop through all classes\n for class_idx in range(num_classes):\n predictions = []\n targets = []\n\n # get all the predicted and target bboxes\n # belonging to current class\n for prediction in pred_boxes:\n if prediction[1] == class_idx:\n predictions.append(prediction)\n\n for target in target_boxes:\n if target[1] == class_idx:\n targets.append(target)\n\n # count how many target boxes appear in each image of the dataset\n count_target_boxes = Counter([t[0] for t in targets])\n\n # we need to keep track of the target bounding boxes that have already\n # been covered, so that in case there are several bbox predictions for\n # the same target bbox, only the first one with the highest certainty\n # will be considered true and the others false;\n # If the first img has 2 bboxes and the second img has 4 bboxes,\n # after the loop below, count_target_boxes will look like\n # {0:torch.tensor([0,0]), 1:torch.tensor([0,0,0,0]), ... }\n for key, val in count_target_boxes.items():\n count_target_boxes[key] = torch.zeros(val)\n\n # sort predictions by their certainties, starting from the highest\n predictions = sorted(predictions, key=lambda x: x[2], reverse=True)\n\n # tensors to keep track of true and false positives\n true_positives = torch.zeros(len(predictions))\n false_positives = torch.zeros(len(predictions))\n\n all_target_bboxes = len(targets)\n\n # now we pick one prediction at a time\n for prediction_idx, prediction in enumerate(predictions):\n\n # and get the target bboxes from the corresponding image\n targets_corresponding_img = [\n bbox for bbox in targets if bbox[0] == prediction[0]\n ]\n\n # now this is the length of all target bboxes in that image\n num_targets = len(targets_corresponding_img)\n\n # for each of the target bboxes, calculate its IOU with\n # the current prediction and find the best\n best_iou = 0\n for target_idx, target in enumerate(targets_corresponding_img):\n iou = intersection_over_union(\n torch.tensor(prediction[3:]),\n torch.tensor(target[3:]),\n )\n if iou > best_iou:\n best_iou = iou\n best_target_idx = target_idx\n\n if best_iou > iou_threshold:\n if count_target_boxes[prediction[0]][best_target_idx] == 0:\n # has not yet been covered, so it's a true positive\n true_positives[prediction_idx] = 1\n # now we check it as covered so that\n # it won't get covered again later\n count_target_boxes[prediction[0]][best_target_idx] = 1\n else:\n # it's a false positive\n false_positives[prediction_idx] = 1\n else:\n # it's a false positive\n false_positives[prediction_idx] = 1\n\n # calculate the cumulative sums to be able to calculate\n # precision and recall up to each prediction in the sequence\n true_positives_cumsum = torch.cumsum(true_positives, dim=0)\n false_positives_cumsum = torch.cumsum(false_positives, dim=0)\n recalls = torch.divide(true_positives_cumsum, (all_target_bboxes))\n precisions = torch.divide(\n true_positives_cumsum, \n (true_positives_cumsum + false_positives_cumsum)\n )\n\n # for integrating the precision-recall curve, we'll also\n # need the starting point (recall=0, precision=1),\n # so we need to concat these values\n recalls = torch.cat((torch.tensor([0]), recalls))\n precisions = torch.cat((torch.tensor([1]), precisions))\n\n # integrate the curve\n average_precisions.append(torch.trapz(precisions, recalls))\n\n # take the mean and return\n mAP = sum(average_precisions) / len(average_precisions)\n return mAP\n```\n\nAnd that's basically it. As mentioned above, we can now also calculate this metric for different IOU thresholds, e.g. from 0.5 to 0.95 in steps of 0.05, and average over them.\n\n### Conclusion\n\nLet's quickly recap the concepts we covered: We've seen that the fundamental metric that tells us how much predicted bounding boxes overlap with target boxes is the intersection-over-union (IOU). We solved the problem of having potentially many bounding boxes predicted for the same object with non-max suppression. Finally, we've seen how the overall model can be scored with mean Average Precision (mAP). Now that we know how to handle bounding boxes that are predicted by an object detection model, we can have a look at an actual model. In an upcoming post I will have a look at the YOLO architecture, so stay tuned. 
Thanks a lot for reading!\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Thanks for the feedback","A summary and discussion of the book of the same title",new Date("2021-06-28"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/feedback/feedback.jpg","How to best use feedback to grow?",["Non-Tech","Learning"],"This is gonna be on feedback: how to get the most out of the feedback you receive and use it for your personal development, as discussed by Douglas Stone and Sheila Heen in their [book](https://www.amazon.com/Thanks-Feedback-Science-Receiving-Well/dp/0670014664) _Thanks for the Feedback - The Science and Art of receiving Feedback well (even when it is off base, unfair, poorly delivered, and, frankly, you're not in the mood)_. Yep, pretty long title, but it conveys the message quite clearly. Stone and Heen are both lecturers at the Harvard Law School and have studied how to get the most value out of feedback in their research in the Harvard Negotiation Project. When I read non-fiction books, I usually like to take notes. This time, I thought I should condense my notes into a blog post, because I really enjoyed this book and found it super useful for extending my own understanding of feedback, both in terms of professional and personal relationships. So, this post is basically a summary and light discussion of the aforementioned book.\n\nBesides learning new technologies, programming languages, or frameworks, I think it is important for developers to sharpen their soft skills as well. Especially the ones concerning interpersonal aspects. After all, most of us don't work completely isolated from other humans; we interact with our colleagues and customers and there's a lot they can teach us about ourselves and about what we could potentially improve in this interaction or the way we do our work in general. Probably most businesses have understood by now that it's important to give feedback to their employees if they want them to improve, and managers are often trained in giving some sort of feedback to their teams. The receiving end is often overlooked, even though, as Stone and Heen argue in their book, it is the more important side for making feedback effective. Just because you give someone feedback, it doesn't mean that they'll act on it. Actually, receiving critical feedback can be pretty difficult, because it shows us that what we've been doing so far was not perceived as ideal. We often feel triggered by feedback we receive, which makes it hard to rationally look at it, but very easy to discard it. In the following, we'll have a look at why exactly we have a hard time with receiving feedback and what we can do to change that and use it for our personal growth instead.\n\n### Why can it be hard to receive feedback well?\n\nFeedback comes in many forms, and some of it is certainly welcome. A clap on the shoulder, or being told \"hey you did great, keep going\" sure feels nice. But critical feedback? Getting an evaluation that does not match our expectations? Being told that the way we usually do our work, the way we've perfected over a long time, is actually suboptimal and requires some adjustments? Doesn't feel so nice. But what exactly is it that makes us feel bad about it? Stone and Heen have identified three main groups of triggers that can make it hard for us to receive feedback well. _Truth Triggers_ revolve around the actual content of the feedback. We may feel that it is simply wrong, unhelpful, or unaligned with our expectations. 
In consequence, we feel wronged and discard the feedback. _Relationship Triggers_ are activated by the particular person that is giving us the feedback. Our focus shifts from the feedback itself to how we think that this person, given our relationship, shouldn't be giving us such feedback. Lastly, there are _Identity Triggers_. We feel attacked when the feedback contradicts the image of how we perceive ourselves. Let's discuss these three categories in more detail below.\n\n### Truth Triggers\n\nTo understand why we often feel that feedback is wrong, we should try to first understand what feedback actually is. According to the authors, there are three general types of feedback that have different purposes:\n\n- **Appreciation**: to motivate and encourage\n- **Coaching**: to help increase knowledge, skill, capability etc. or to address feelings in relationships\n- **Evaluation**: to tell us where we stand, inform decision making, and align expectations\n\nWhile all of these are useful and necessary, we don't always want or expect all of them. If the giver and the receiver of feedback talk at cross-purposes because they have different ideas on which type the conversation should be about, the feedback surely won't be received well. So the first thing we should be doing before even getting or giving feedback is to be mindful about what we need/want or what we're being offered, and align our expectations. It might also be a good idea to actually cover the different purposes in separate conversations. Evaluation, especially if it's not the evaluation we were hoping for, can drown out the rest. Imagine you get a bad performance review and then your manager tries to coach you on what you should improve. Probably you won't be listening to a single word of that coaching because your mind is occupied with thinking about what an idiot your boss is for giving you a bad review. Let some time pass so that you'll (hopefully) understand that it's a good idea to receive coaching so that your next performance review will be better.\n\nFeedback can be pretty vague; certainly not everyone is giving it to us in a fully comprehensible manner. Hence, it can be hard to see their point and see what may be true about their feedback. It is, however, very easy to spot what we think is wrong about it right away. But if it is obviously wrong, why would they give it in the first place? Maybe there is a tiny bit of truth in it. To find that tiny bit, discuss and understand where the feedback is coming from and where it is going to. Might they have different data than us? Or a different interpretation of it? It's easy to see the same thing but put very different labels on it. Do they want to give us advice or inform us about expectations or consequences? We need to ask ourselves what is right and legit about their feedback, and see what concerns we have in common. If we can work together with the feedback giver, we can get a more complete picture and maximize what we'll (both) get out of the conversation.\n\nAnother reason that makes wrong-spotting in others' feedback so easy for us is that we simply don't see all the things about ourselves that others may be seeing. We all have some blind spots that are obvious to others or we may be unconsciously sending some signals that we are not aware of but that are easily picked up by others. Most of the time we are not aware of our facial expression or tone of voice. Maybe we think we're saying really nice things, but our face and voice are telling everybody else that we mean the opposite. 
We may be unaware of even big patterns of behavior. Let's think about what factors could cause or amplify our blind spots. When it comes to feedback, often we tend to discount our own emotions, but others may count them double. We also tend to attribute positive outcomes to our own efforts while we try to excuse negative outcomes with the situation or circumstances. Others may do the exact opposite and attribute negative outcomes to our character. They saw that we messed it up; must be our fault, why care about the circumstances. Another problem may be a gap between the intention of our actions and the actual impact they have on others. We judge ourselves by our intention, while others, of course, only care about our impact on them. It doesn't matter how good our will was if we try to fix a situation for others but end up making it worse. The problem is that in all these situations, we're usually not aware of the fact that others may perceive our actions so differently from how we do. To identify our own blind spots, we need the help of others. So let's invite our feedback givers to help us see ourselves and answer the question of how we are standing in our own way.\n\n### Relationships and Systems\n\nAs mentioned before, we're often more triggered by who is giving the feedback, rather than by its actual content. How we receive feedback depends a lot on how we think of the giver. Do we trust them or think that they're credible and have good judgement and skill? Imagine the \"business\" guy tells you that your code performs badly and needs to run faster. What does he know? He doesn't even know how to program and has never worked on a software development project. Easy to discard it right away. But hey, maybe you could optimize it and make it run a bit faster, regardless of who is telling you to do so. We also tend to care a lot about how we are treated by the feedback giver. Do we feel that we are accepted, appreciated, and that our autonomy is respected? If not, we're likely to not take the feedback well. Who wants to listen to a jerk's advice anyways. We'd rather give them some of our feedback, telling them how obnoxious we think they are. Before we know it, we're suddenly having a conversation about two totally different topics and are talking past each other. Stone and Heen call this phenomenon switch-tracking. Maybe topics in a feedback conversation need to be discussed both ways, but better not simultaneously. If we realize that we're switch-tracking, we need to give both topics their own tracks. It can be challenging to take feedback from people that we don't trust, e.g. strangers, and especially from people we find difficult. But people who see us at our worst may be especially good at pointing out the areas where we have the most room to grow; hence, we should not avoid these conversations. We should be vigilant, though, for relationship issues hiding under coaching. Not everyone who's trying to \"fix\" us has our wellbeing in mind.\n\nIf we're triggered by feedback due to the person delivering it, it is a good idea to look out for relationship systems that may be the cause of our discomfort. We can take three steps back here. First, look at the Us + Them intersection. Are the differences between us creating friction? How are we in the relationship? 
E.g., does your business partner want you to be really frugal and not spend one cent too much, whereas you think that you need to invest more courageously, and are you both criticizing each other for your respective lack of initiative and wastefulness? Instead of focusing on what the other person is doing wrong, step back and notice what each one is doing in reaction to the other. Next, we step back further and see if it's our respective roles that cause the friction. Do our roles clash? Are the incentives that come with our roles opposing each other or are our responsibilities overlapping or poorly defined? If any of that is the case, the problem probably doesn't lie with any individual, but rather with how the roles are organized. If we step back even further, we can have a similar look at processes, policies, or the physical environment reinforcing the problem. Are centralized processes too inflexible for local needs? Are different time zones making it difficult to cooperate with your overseas colleagues? Are you being pulled into a conflict with another team because a new corporate policy wants them to kind of replace your team? Again, probably not any individual's fault, but a result of the systems we're all in. Looking at systems helps us to reduce judgement, enhance accountability, and uncover root causes of problems.\n\n### Identity and Growth Mindset\n\nThe last category of triggers is probably the most difficult to accept, because they can attack our very own identity and the way we see ourselves. If people criticize what we do, we may think they are criticizing us as a person, the way we are, the way we live our lives. Different people react very differently to feedback that could be interpreted as attacking their identity. We have different baselines, and the emotional impact (up or down) of feedback, as well as how fast we recover from it, can vary a lot. Our emotions can distort our stories about the feedback itself significantly. According to Stone and Heen, about half of the influence on how we perceive feedback is genetically wired into our predisposition. While some people with a high baseline are hardly bothered by critical feedback, others with a lower baseline may feel that the negative feedback they got now is what they've always been getting and always will be getting for the rest of their lives. To mitigate the emotional impact of feedback, we first need to dismantle the distortions we perceive. There are a couple of things we can do about that: We should be prepared and mindful when we're getting feedback and separate the strands. What is the actual statement, and what are our feelings about it and the story we craft from them? Those are different things. We can try to contain the story that forms in our mind by making clear what the feedback actually is about and what it isn't about. If we did badly at one particular task, and we're being told so, it doesn't mean we're horrible humans or that the feedback giver thinks so. But no matter what others think about us, we need to accept that we cannot control the way they see us. And we don't need to fully buy their story about us either - others' views on us are input, not imprint.\n\nProbably the most important strategy for receiving critical feedback well and using it for our personal development is to cultivate a growth identity. How do we tell our own story? Do we think that we're either all-or-nothing good or bad at something? Then we should probably shift to appreciating how complex reality is. 
If we think that all of our abilities are given, predetermined by our genetics, our upbringing, or sociocultural background, we leave no room for development. When we fail at something that we'd like to be succeeding at, instead of thinking \"oh well, I guess I suck at this and there's nothing I can do about it\", rather shift to \"I might suck at this now, but I can certainly get better at it if I work on it hard\". We must shift to seeing challenges as opportunities and feedback as useful information for learning and growth. When cultivating a growth mindset, try to really accept the coaching you get as such, and try to even get some coaching out of plain evaluation. When being evaluated, try to separate the judgement from the actual assessment and consequences. Also, don't judge yourself only based on the first evaluation you get, but rather on how well you responded to it and how much you managed to grow from that feedback - give yourself a second score for how you handled the first score! I personally fully agree with Stone and Heen on this point. The right mindset is the single most important thing for personal and professional development. I can observe it in myself; I can welcome even painful feedback as long as I'm convinced that it'll give me a chance to improve myself, and I've observed the same with my students.\n\n### How good do I have to get?\n\nOkay, so should we all grow infinitely? If there's always more to fix, to improve, if the criticism never ends, we may eventually just burn out. Therefore, setting boundaries on feedback is also critical to healthy relationships and life-long learning. Stone and Heen define three types of boundaries that can be useful to not be overwhelmed by feedback: _Thanks and No_: we're happy to hear your advice, but we may not take it. So please don't complain that we never listen to you; we do, but we consider your advice as only one possible option. Next, there is _Not now, not about that_: we need some time and space now and don't want to talk about that sensitive topic. We just got fired and need to have a couple of drinks before we want to listen to why we messed it up again. Lastly, _No Feedback at all_: I know what you'd like to say and I don't want to hear it. That's my decision and you need to respect it. Whether our relationship continues depends on whether you can keep your judgement to yourself or not. When we feel like we have to turn down feedback, we should try to be appreciative and firm, and rather use the word \"and\" instead of \"but\". We don't necessarily disagree with what the feedback giver said _and_ we also don't want to discuss it further for now. Let's be specific about our exact request, the time frame, the consequences, and their assessment. Also, if we decide that we won't be changing based on their feedback, we should at least try to reduce the negative impact that we have on them. We need to understand that impact, and then try to problem-solve together, maybe even coach them in turn on how to deal with the unchanged us.\n\n### Conclusion\n\nBased on the work of Douglas Stone and Sheila Heen, we've discussed that feedback is a great opportunity for growth, but can often be hard to receive well. We've seen that three main groups of triggers, concerning the truth of the feedback, the relationship to the person giving it, and the impact on our identity, can make it difficult for us to accept the feedback we get. 
Feedback can be roughly categorized into appreciation, coaching, and evaluation, and often comes with vague labels; hence, intentions are often confused and expectations not met. We often have blind spots that we're not aware of and need the help of others to see them. Sometimes the relationship to the feedback giver is problematic because our roles are opposed or processes and policies are causing our conflict. We often have feedback for each other and it is important to not switch-track in one conversation, but give each topic its own track. Receiving feedback can be particularly tricky when we feel that our identity is under attack. To not feel emotionally assaulted, we need to dismantle the distortions that we build around the actual feedback itself, and cultivate a growth mindset that allows us to see feedback as an opportunity to develop ourselves. Lastly, we've also seen that it's important to be able to draw boundaries when we're not able or ready to receive certain feedback. I really enjoyed this book and I'll probably have a look at the authors' other book, [_Difficult Conversations: How to discuss what matters most_](https://www.amazon.com/Difficult-Conversations-Discuss-What-Matters/dp/0670921343) soon. So, thanks to Douglas and Sheila for the book and thank you for reading this post!\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("The Doggo Snap Mobile App","Classify dogs' breeds whenever you meet them",new Date("2021-04-19"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/doggo-snap/title.png","What breed is it? Find out with Doggo Snap!",["Data Science & AI/ML","Web Development"],'**TL;DR:** I built a mobile app called Doggo Snap which can be used for classifying a dog\'s breed from an image. It provides you with information on the breed and lets you save the dogs you\'ve photographed so that you can have a look at them later and see where you\'ve met them on a map. I created the app using React Native and built it for [Android](https://play.google.com/store/apps/details?id=com.pascalbliem.doggosnap) and iOS (though the iOS version is not available in the app store as of now). For classifying a dog\'s breed with computer vision, the app calls an API which hosts a Deep Learning model (a convolutional neural network) in the cloud. \n \n### Find out what dog breed it is!\n \nA lot of people like dogs - of course they do, humans have been domesticating their "best friends" for aeons, which resulted in a diversity of breeds originating from all over the globe. Certainly, there are some real dog enthusiasts who know every breed and can tell a breed\'s typical size, weight, or temperament. But most people, even if they like dogs, probably only know a handful of breeds by name. So what do you do when you meet a cute dog outside and would like to know what breed it is? Well, you could ask the dog\'s owner, they usually know. Or you download Doggo Snap, take a picture of the dog, and get all the info you wanna have on your new furry friend. With the [Doggo Snap](https://play.google.com/store/apps/details?id=com.pascalbliem.doggosnap) app, you can take a photo or upload one from your gallery and classify it fast. 
You\'ll be shown the breed name, how certain the algorithm is with its decision, and info on a breed\'s typical temperament, size, weight, lifespan, and region of origin.\n \nThe app was built in [React Native](http://www.reactnative.com/), an open-source mobile application framework in which you can write an application in JavaScript (as you would in [React](https://reactjs.org/) for web applications), and compile it to true native apps for Android or iOS. For a smoother development workflow, I used the [Expo](https://expo.io/) framework. The app is designed to make people who care about privacy happy; no user data is stored on a server. The only contact the app has with a backend is sending the dog image via a POST request for classification, and the image is not saved to a backend database. Instead, the dogs that are saved by the user are handled in app state with [Redux](https://redux.js.org/) and persisted to an on-device [SQLite](https://www.sqlite.org/index.html) database. The neural network doing the image classification, which I described in [an earlier post](http://www.pascal-bliem.com/blog/classifying%20dog%20breeds%20with%20deep%20learning), is a slightly modified [MobileNetV2](https://arxiv.org/abs/1801.04381) pretrained on [ImageNet](http://www.image-net.org/) and fine-tuned on 121k pictures of 121 dog breeds. The model is being hosted in [Render\'s](https://www.render.com/) cloud with the help of [FastAPI](https://fastapi.tiangolo.com/). I won\'t go into any code here, as the app\'s code isn\'t really arrangeable in a nice linear fashion and it is just too much stuff to bore you with here. If you want to see details though, please have a look at the [Github repo](https://github.com/Pascal-Bliem/doggo-snap).\n \n### Thinking about functionality, state, and navigation\n \nWhat does an app for dog breed classification need to do and how do we let all components of the application know what\'s going on? React apps are generally composed of components, which are independent, reusable, and conceptually isolated pieces of the UI. Some of these components higher up in the component hierarchy will be the screens we eventually see in the app. To get around from one screen to another, I use the stack navigator provided by the [React Navigation](https://reactnavigation.org/) library. Individual components can hold state to keep track of what\'s going on inside of them, and they can also pass down this information as properties (or just props) to their child components. However, if there are many components at different levels of the UI hierarchy that need to access the same state information, this can get messy pretty quickly. It would be much better if there was a single source of truth that is easily accessible from all components. As mentioned above, in this app I use Redux for state management, which will allow us to tap into the app state and also dispatch actions that modify the state, from anywhere within the application. To make sure that the user doesn\'t lose their saved dogs every time they restart the app, the state is loaded from and mirrored to an on-device SQLite database.\n \n![An overview of Doggo Snap\'s functionality](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/doggo-snap/diagramm.png)\n \nWe don\'t have any user data being saved on a backend, so no user creation or authentication is required. Pretty much everything stays on the user\'s device. Starting at the home screen, we want to be able to take a new photo or select an old one from the gallery and classify it. 
\n\nThe Home Screen \n\nThe classification happens by sending the image via an HTTP request to the machine learning API, getting the classification data back in the response, and displaying it on a classification screen, to which we automatically navigate from the home screen. On the classification screen, the taken image will be displayed along with the dog\'s breed and the certainty of the classification. In addition, some info on the breed is given, such as typical temperament, height, weight, lifespan, and region of origin.\n \nThe Classification Screen \n\nUp to this point, no state is needed. But the user should be able to save her classified dogs as well. This is obviously where state management comes in. When the user chooses to save a dog, she navigates to the save screen, where the dog can be given a name. Furthermore, the user can specify where she met the dog by either determining the device\'s current location or by picking a location on the map manually. For the map component, I use [React Native Maps](https://github.com/react-native-maps/react-native-maps), which will use Google Maps as a map provider if you\'re on Android or Apple Maps if you\'re on iOS. Besides getting the raw latitude and longitude of the dog\'s location, I also use the [Nominatim](https://wiki.openstreetmap.org/wiki/Nominatim) reverse geocoding API to generate a human-readable address from the location data. All this data, the image, breed, name, location, and address, is used to create a dog instance which is saved to the local database and appended to the app state\'s list of saved dogs.\n \nThe Save Screen \n\nAfter saving, the user will be navigated to a screen with a list of her saved dogs. The dog entries can be tapped to get to a details screen which is essentially the same (using mostly the same components) as the classification screen, except that it now shows the dog\'s name in addition to its breed, as well as a map view showing the dog\'s location. The dog entry can also be deleted from this screen, which would trigger a deletion of the dog instance from the database and the app state.\n \nThe Dog List Screen \n\nTo explore where exactly the user met which saved dog or to explore where the dog-encounter-density is particularly high, she can navigate from the home screen to the map screen. All the dogs are shown as markers in the form of the dogs\' images at their respective locations. Tapping such a marker will navigate to the same details page as would tapping a dog\'s list entry.\n \nThe Map Screen\n\nFinally, the user may run into the problem that she\'s interested in learning about a lot of dog breeds but does not necessarily encounter all of them in her usual surroundings. No problem! Doggo Snap supports 121 different dog breeds and all the info on these breeds comes bundled with the app. From the home screen, the user can navigate to the Explore-all-Breeds-Screen which displays a list of all supported dog breeds. Tapping on any entry navigates you to the same kind of details screen as the classification screen.\n \nThe All-Breeds Screen \n\n### Conclusion\n\nThat\'s it! A React Native app that lets you classify dog breeds from images, save your favorite dogs, and revisit them, even on a map! 
In previous posts, I\'ve already elaborated on how to [build a Deep Learning model](http://www.pascal-bliem.com/blog/classifying%20dog%20breeds%20with%20deep%20learning) in PyTorch for image classification on dog photos, and how to [transfer it to the ONNX format](http://www.pascal-bliem.com/blog/transfer%20ml%20models%20easily%20with%20onnx) (which is how the API uses the model). In this post, I\'ve explained how I make these machine learning capabilities accessible to users on their smartphones. Using React Native, I\'ve written React code in JavaScript that was turned into native applications for Android and iOS. If you\'re interested in looking at it in detail, check out the [Github repo](https://github.com/Pascal-Bliem/doggo-snap), or download the app from the [Play Store](https://play.google.com/store/apps/details?id=com.pascalbliem.doggosnap). Thanks a lot for reading!\n ')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Transfer ML Models easily with ONNX","Moving a Deep Learning Model from Python to JavaScript",new Date("2021-03-25"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/onnx/onnx-logo.png","The Open Neural Network Exchange (ONNX)",["Data Science & AI/ML","Web Development"],'**TL;DR:** The Open Neural Network Exchange (ONNX) allows you to transfer machine learning models into a common format that can easily be exchanged between different frameworks or runtimes. Here, I will show how to export a deep learning model for classifying dog breeds from PyTorch (Python) into the ONNX format and run inference in NodeJS (a JavaScript runtime).\n\nInsufficient interoperability in machine learning has been a pain point for a long time. There are many different frameworks for machine learning or deep learning that can usually not be used interchangeably. Most of these frameworks, which are used for the development of machine learning models, are written in the Python programming language (or in C/C++ but at least have a Python wrapper). However, the system on which the model should eventually perform inference might be written in a different programming language. Think about execution in a mobile app or the browser; Python is usually not used there. In the past, this often meant that a model would have to be reimplemented in the target system.\n\nTo avoid the overhead of reimplementation and promote interoperability between different frameworks, tools, runtimes, and compilers, Microsoft and Facebook started developing the [Open Neural Network Exchange (ONNX)](https://onnx.ai/) in 2017. On their [Github](https://github.com/onnx/onnx), they describe it as follows: "ONNX provides an open source format for AI models, both deep learning and traditional ML. It defines an extensible computation graph model, as well as definitions of built-in operators and standard data types." Since every machine/deep learning model can be represented as a combination of operators, models from any machine learning framework can be transferred into the ONNX format and be executed by the ONNX runtime, which is available in a couple of programming languages. In the following, I will export a model from PyTorch and run inference on it in NodeJS.\n\n### Exporting a model to ONNX\n\nIn a [previous blog post](http://www.pascal-bliem.com/blog/classifying%20dog%20breeds%20with%20deep%20learning), I described how I built a CNN deep learning model to perform image classification on 121 different dog breeds. I developed this model in PyTorch, a Python deep learning framework. 
PyTorch has some built-in capabilities to export its models to ONNX and also offers a [tutorial](https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html) on how to do it. In the previous post, we created the model like this:\n\n```python\nimport torch\nimport torch.nn as nn\nfrom torchvision import models\n\n# get predefined model architecture\nmodel = models.mobilenet_v2(pretrained=True)\n\n# replace the classifier-part of the model that has been pretrained on ImageNet\n# by a new classifier for classifying dog breeds\nnum_clf_in_features = model.classifier[1].in_features\nmodel.classifier = nn.Sequential(\n nn.Dropout(p=0.2, inplace=False),\n nn.Linear(in_features=num_clf_in_features, out_features=len(class_names), bias=True),\n nn.LogSoftmax(dim=1)\n)\n\nmodel.to("cuda")\n```\n\nWe can then load the trained model parameters by:\n\n```python\nmodel.load_state_dict(torch.load(checkpoint_dir))\n```\n\nIn this case, it would be relatively straightforward if we wanted to run the model on the current Python implementation of the ONNX runtime. However, earlier versions of the runtime seem to be lacking certain operators, and it appears that the runtime currently implemented for JavaScript in the library [ONNX.js](https://github.com/microsoft/onnxjs) is lacking some of the operators that our model uses. If we just exported the current model as it is, we would later (when running it in NodeJS) get errors that would look like:\n\n```\nTypeError: cannot resolve operator \'LogSoftmax\' with opsets: ai.onnx v9\n```\n\nand\n\n```\nTypeError: cannot resolve operator \'Shape\' with opsets: ai.onnx v9\n```\n\nFixing the first one is easy: The final LogSoftmax layer in the model doesn\'t have any weights, so we can just replace it with a regular Softmax layer without having to retrain the model. A Softmax operator is implemented in ONNX.js at the moment. The only difference is that the model will now output regular probabilities between 0 and 1, instead of log probabilities. Fixing the "Shape" error was a little more tricky. Apparently, ONNX.js currently does not support dynamic shape calculation, but the `torchvision.models.mobilenet_v2` class implements a method `_forward_impl` which has a dynamic shape operation:\n\n```python\ndef _forward_impl(self, x):\n # This exists since TorchScript doesn\'t support inheritance, so the superclass method\n # (this one) needs to have a name other than `forward` that can be accessed in a subclass\n x = self.features(x)\n # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0]\n x = nn.functional.adaptive_avg_pool2d(x, 1).reshape(x.shape[0], -1)\n x = self.classifier(x)\n return x\n```\n\nIf we replace this operation with a `flatten` operation (like in [this](https://github.com/pytorch/vision/blob/c991db82abba12e664eeac14c9b643d0f1f1a7df/torchvision/models/mobilenetv2.py#L103) implementation), the method `_forward_impl` will look like this:\n\n```python\ndef _forward_impl(self, x: Tensor) -> Tensor:\n # This exists since TorchScript doesn\'t support inheritance, so the superclass method\n # (this one) needs to have a name other than `forward` that can be accessed in a subclass\n x = self.features(x)\n # Cannot use "squeeze" as batch-size can be 1\n x = nn.functional.adaptive_avg_pool2d(x, (1, 1))\n x = torch.flatten(x, 1)\n x = self.classifier(x)\n return x\n```\n\nSince the `flatten` operator is implemented in ONNX.js, this should work.\n\nNow let\'s export the model to ONNX. 
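(A small aside: the post does not show the LogSoftmax-to-Softmax swap explicitly; assuming the `nn.Sequential` classifier defined above, a minimal sketch of that swap could look like this:)\n\n```python\n# swap the final LogSoftmax layer (index 2 of the Sequential classifier)\n# for a regular Softmax - it has no weights, so no retraining is needed\nmodel.classifier[2] = nn.Softmax(dim=1)\n```\n\n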
We need to define an output path `onnx_model_dir` and some dummy input data, so that the model knows which input shape to expect:\n\n```python\nimport onnx\nimport onnxruntime as ort\n\n# dummy data to define the input shape\n# (batch size 1, 3 channels, height 224, width 224)\ndummy_input = torch.randn(1, 3, 224, 224, device=\'cuda\')\n\ntorch.onnx.export(\n model,\n dummy_input,\n onnx_model_dir,\n verbose=False,\n # input_names= we can name the input and output\n # output_names= layers here, if we want to\n)\n```\n\nWe can check if the export worked by validating the model:\n\n```python\n# load the ONNX model\nonnx_model = onnx.load(onnx_model_dir)\n\n# check that the model is well formed\nonnx.checker.check_model(onnx_model)\n\n# print a representation of the graph (very long)\n# print(onnx.helper.printable_graph(onnx_model.graph))\n```\n\nIf everything works fine up to here, we can now try to do some inference with the ONNX runtime in Python. The `val_loader` is the validation data loader (see [previous blog post](http://www.pascal-bliem.com/blog/classifying%20dog%20breeds%20with%20deep%20learning)).\n\n```python\n# create an inference session based on the exported model\nort_session = ort.InferenceSession(onnx_model_dir)\n\n# get a batch from the data loader\nbatch, labels = next(iter(val_loader))\n\n# run a prediction on the first image in the batch\n# (note that, since I didn\'t name the input or output\n# layers when exporting the model, the first input\n# layer was automatically named "input.1")\noutputs = ort_session.run(None, {"input.1": batch.detach().numpy()[:1]})\n\n# print the output\noutputs\n```\n\n```\n[array([[2.27155489e-10, 6.50463699e-06, 1.23810617e-09, 9.81572441e-08,\n1.22788846e-08, 2.33644937e-06, 1.34210339e-07, 1.9075965e-10,\n\n# [...] I cut out most entries for readability\n\n3.75615491e-05, 2.95915470e-09, 7.11135826e-08, 7.9975457e-09,\n6.07401551e-09]], dtype=float32)]\n```\n\nWe get back a list containing an array with predicted probabilities for each of the 121 dog breeds - everything seems to work well.\n\n### Running inference in JavaScript\n\nNow the real magic happens when we change from one programming language to another. Since I am planning to serve my dog classifier in a mobile app, which I\'d like to write in [React Native](https://reactnative.dev/), I want to try out running it in JavaScript. Specifically, I will use the [NodeJS](https://nodejs.org/en/) runtime. Besides the JavaScript libraries [ONNX.js](https://www.npmjs.com/package/onnxjs) and [onnxjs-node](https://www.npmjs.com/package/onnxjs-node), we\'ll also need [NumJs](https://www.npmjs.com/package/numjs) (a JavaScript equivalent of NumPy) for processing image data. 
Just like I did when training the model in PyTorch, I need to normalize the image data based on the mean and standard deviation I calculated for each channel.\n\n```javascript\n// import libraries\nrequire("onnxjs");\nrequire("onnxjs-node");\nconst nj = require("numjs");\n\n// import a mapping from class numbers to class names\nconst classMap = require("./class_mapping");\n\nconst IMAGE_SIZE = 224;\n\n// load an image and resize to IMAGE_SIZE\nlet img = nj.images.read("./sample_images/1.jpg");\nimg = nj.images.resize(img, IMAGE_SIZE, IMAGE_SIZE);\n\n// a function that performs per-channel normalization of each pixel\nfunction normalizePerChannel(img, channel, mean, std) {\n // extract all pixels for the given channel\n img = img\n .slice(null, null, [channel, channel + 1])\n .reshape(IMAGE_SIZE * IMAGE_SIZE);\n\n // make sure the underlying data type is float32\n img.selection.data = new Float32Array(img.selection.data);\n\n // normalize pixel values\n img = img.divide(255.0).add(-mean).divide(std);\n\n return img.tolist();\n}\n\n// normalize each RGB channel\nconst channelR = normalizePerChannel(img, 0, 0.512, 0.267);\nconst channelG = normalizePerChannel(img, 1, 0.489, 0.263);\nconst channelB = normalizePerChannel(img, 2, 0.422, 0.271);\n\n// combine all channels into one array\nconst imgData = [...channelR, ...channelG, ...channelB];\n```\n\nNow that the image data is prepared, we\'ll perform inference on it:\n\n```javascript\n// open an ONNX inference session with CPU backend\nconst session = new onnx.InferenceSession({ backendHint: "cpu" });\n\nsession.loadModel("./ml_models/dog_classifier.onnx").then(() => {\n // build input data tensor by providing the image data,\n // data type, and input dimensions\n const inferenceInputs = [\n new onnx.Tensor(imgData, "float32", [1, 3, 224, 224]),\n ];\n\n // run inference on the input tensor\n session.run(inferenceInputs).then((output) => {\n const outputTensor = output.values().next().value;\n // get index of highest probability prediction\n const pred = outputTensor.data.indexOf(Math.max(...outputTensor.data));\n // log prediction to console\n console.log(`Prediction: ${classMap[pred]}.`);\n });\n});\n```\n\n```\nPrediction: golden_retriever\n```\n\nGreat! The model still works and classifies dog breeds correctly, without having to reimplement any part of it in JavaScript. This greatly simplifies the usage of machine learning models across different programming languages and devices.\n\n### Conclusion\n\nWe have learned about the Open Neural Network Exchange (ONNX), which allows us to transfer machine learning models into a common format that can easily be exchanged between different frameworks or runtimes. We have seen how to export a model that has been developed in Python in the PyTorch framework into the ONNX format and then use this common format to perform inference with this model in a JavaScript runtime. In the future, I\'m planning to use this model and the ONNX.js runtime to build a mobile app for classifying dog breeds on photos. Stay tuned for a blog post on that topic. 
Thanks for reading!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Classifying Dog Breeds with Deep Learning","Specialized CNN architectures, transfer learning, and lots of data",new Date("2021-02-24"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/dog-classifier/dogbreeds.png","Which breed is this?",["Data Science & AI/ML"],"**TL;DR:** I scraped a large data set containing about 121k images of 121 different dog breeds to train a CNN deep learning classification model with PyTorch. Starting from a model pretrained on ImageNet, the model performs at about 89% accuracy on validation data, with relatively simple additional training.\n\nI haven't built any cool deep learning app in quite a while, and I felt it's about time. I already gathered some experience in the field of natural language processing with my [ToxBlock](http://www.pascal-bliem.com/blog/tox%20block%20using%20ai%20to%20keep%20discussions%20clean) app, so now I'd like to do something in the field of computer vision. Quite some time ago I thought about building an app for [Chinese character OCR](http://www.pascal-bliem.com/blog/a%20data%20set%20of%20handwritten%20chinese%20characters), but had to give up the idea when I found out that training a model that could recognize thousands of different characters was a bit too much for the resources available to me. I still like the idea of building an app that people can have on their mobile phones and use for classifying stuff they come across. If not Chinese characters, what else would be a good target for image classification? I asked a few people what they would like to take pictures of when walking around outside, and \"definitely DOGS!\" was among the replies I got. And I thought that was a great idea; I see tons of cute dogs when taking a walk in the park, but I only know like 5 dog breeds by name. Having an app on my phone that tells me which breed it is would be amazing! In this post I'll talk about how I obtained a data set and how the actual neural network that does the classification is built and trained. Anything related to mobile development won't be in this post, but I'll probably cover it in a later post.\n\n### Getting the data\n\nOkay, so the first thing in every machine learning project is data. I'm probably not the first person thinking about classifying dog breeds, so I'd assume there are some data sets available online. And, in fact, there is a pretty famous one called [Stanford Dogs](http://vision.stanford.edu/aditya86/ImageNetDogs/) (from Stanford University). It contains about 150 pictures for each of 120 different dog breeds. This data set even got its own [Kaggle competition](https://www.kaggle.com/c/dog-breed-identification) and is a part of [ImageNet](http://www.image-net.org/), a large image classification data set. A similar data set is the [Tsinghua Dogs](https://cg.cs.tsinghua.edu.cn/ThuDogs/) (by Tsinghua University), which contains images of the same 120 breeds plus 10 new classes. The number of images is in proportion to their frequency of occurrence in China, which means some classes are still quite underrepresented.\n\nUsing the Stanford Dogs data set as a starting point is great for several reasons: As I mentioned earlier, I don't know a lot of dog breeds by name, and here, someone has preselected 120 breeds that I can use as well. Furthermore, the fact that these 120 breeds are also a part of the ImageNet data set makes it ideal for transfer learning. 
Many models that have been trained on ImageNet are available online and can be used as a starting point for further, more specialized training. However, only using the two aforementioned data sets won't be sufficient. They still don't contain enough training examples to distinguish all breeds reliably and they're also not licensed for commercial use. But hey, people love their dogs and they love to take photos of them and put them on the internet under a public licence. And a really convenient way to get these images is over an image search API, such as the one from [Bing](https://www.microsoft.com/en-us/bing/apis/bing-image-search-api). I will use the 120 dog breed names plus [Shiba Inu](https://en.wikipedia.org/wiki/Shiba_Inu) (I really like this breed but it wasn't in the Stanford data set) as search terms for the image search API calls and try to get about 1000 images per breed.\n\nIn case you'd like to do something like that and use the Bing image search API, here's a code snippet that you could use for downloading all the image data. Keep in mind that you'd have to sign up for [Microsoft Azure](https://azure.microsoft.com/) to get an API key to use with the API. As of now, I got some free credit when signing up which I used for calling the API, so I didn't actually spend any of my own money. Here, `search_terms` is a Python list of search terms (dog breed names) to look up images for, `directory_names` is a list of directories into which I'll save the Images, and `subscription_key` is the API key.\n\n```python\nimport requests\nfrom PIL import Image\nfrom IPython.display import clear_output\n\nsearch_url = \"https://api.bing.microsoft.com/v7.0/images/search\"\nheaders = {\"Ocp-Apim-Subscription-Key\" : subscription_key}\n\n# loop through all dog breeds\nfor count, search_term, directory_name in zip(\n range(len(search_terms)),\n search_terms,\n directory_names\n):\n # print progress\n print(f\"Entering {directory_name} {count+1}/{len(search_terms)}\")\n\n # create directory for dog breed if it doesn't exist yet\n directory_path = os.path.join(\"data\", \"bing_image_api\", \"images\", directory_name)\n if not os.path.exists(directory_path):\n os.makedirs(directory_path)\n\n # get 50 images at a time\n for offset in range(0, 1001, 50):\n\n # log progress\n clear_output(wait=True)\n print(f\"Entering Offset {offset}\")\n\n # Set up params for the request to the Bing Image Search Api;\n # NOTE that you can set the license type to public and the\n # image type to photo - this is great because it will mostly\n # prevent you from getting drawings or clip art of dogs!\n params = {\n 'q': search_term,\n 'offset': offset,\n 'count': 50,\n 'license': \"public\",\n 'imageType': \"photo\"\n }\n\n # make the request to the API\n response = requests.get(search_url, headers=headers, params=params)\n response.raise_for_status()\n search_results = response.json()\n\n # for each search result that came back from the API call\n for i, result in enumerate(search_results['value']):\n\n # get each image by URL and write to file\n image_url = result['contentUrl']\n image_path = os.path.join(directory_path, f\"{offset+i}.jpg\")\n # open a file handle here to write the image to\n with open(image_path, \"wb\") as handle:\n\n try:\n # log progress\n clear_output(wait=True)\n print(f\"Getting {directory_name} {count+1}/{len(search_terms)} image {offset+i}\")\n\n # send request to get the image\n response = requests.get(image_url, timeout=30)\n\n # skip and remove the image file if something's wrong\n if not 
response.ok:\n os.remove(image_path)\n continue\n\n\n clear_output(wait=True)\n print(f\"Writing {directory_name} {count+1}/{len(search_terms)} image {offset+i}\")\n\n # write to file in blocks until response is empty\n for block in response.iter_content(1024):\n if not block:\n break\n handle.write(block)\n\n # if anything goes wrong, delete the image file and move on\n except:\n os.remove(image_path)\n continue\n\n # validate if the image is uncorrupted; if not, delete it\n try:\n im = Image.open(image_path)\n im.verify()\n except Exception as e:\n os.remove(image_path)\n # also make sure the image is not an animated GIF\n try:\n im.seek(1)\n except EOFError:\n is_animated = False\n else:\n is_animated = True\n if is_animated:\n os.remove(image_path)\n```\n\nNow you should have thousands of dog images available to train on. They may, of course come in all sorts of sizes, mostly larger than you would need them for your neural network anyway, so you may want to resize them already at this point. I found this useful for me because I wanted to train the model on a Cloud service that offers GPU infrastructure, namely [Google Colab](https://colab.research.google.com/), and it goes much faster to upload and unpack the images when they're already down-sized. You could resize all the images like this:\n\n```python\n# the size the image will be resized to\nNEW_SIZE = 256\n\n# loop through all image directories\nfor count, directory_name in enumerate(directory_names):\n source_path = os.path.join(\"data\", \"bing_image_api\", \"images\", directory_name)\n target_path = os.path.join(\"data\", \"bing_image_api\", \"small_images\", directory_name)\n\n # create target directory if it doesn't exist yet\n if not os.path.exists(target_path):\n os.makedirs(target_path)\n\n # loop through each file in the directory\n file_names = os.listdir(source_path)\n for i, file_name in enumerate(file_names):\n\n # log progress\n clear_output(wait=True)\n print(f\"Moving {directory_name} {count+1}/{len(directory_names)} image {i+1}/{len(file_names)}\")\n\n # load the image\n im = Image.open(os.path.join(source_path, file_name))\n\n # resize the image\n im = im.resize([NEW_SIZE, NEW_SIZE])\n\n # save the resized image to target directory\n im.save(os.path.join(target_path, file_name))\n```\n\nNow we should have the data in a neat format for zipping it up and loading it to where ever we have some GPUs available for training a model.\n\n### Building the model\n\nWhen it comes to computer vision tasks, one usually doesn't have to start dreaming up models from scratch. Pretty much all networks that work well with image data are convolutional neural networks (CNNs). Among those, there are many known model architectures that have achieved great results on classification task such as ImageNet. For a while, the way to go to improve accuracy was to make the models deeper and deeper, but that also means they have more parameters. On the one hand, this means it takes longer to train the network, on the other hand, it will also make inference slower, which can be critical on edge devices that have relatively little computing power. 
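If you want to get a feeling for what these model sizes mean in practice, you can simply count the parameters of any torchvision model; here's a minimal sketch (the two ResNets and the rounded counts in the comments are just examples I picked for illustration):\n\n```python\nfrom torchvision import models\n\ndef count_parameters(model):\n    # sum up the number of elements of all parameter tensors\n    return sum(p.numel() for p in model.parameters())\n\n# deeper variants of the same architecture quickly grow by tens of millions of parameters\nprint(count_parameters(models.resnet18()))   # roughly 11.7 million\nprint(count_parameters(models.resnet152()))  # roughly 60 million\n```\n\n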
Considering that I'd like to run the model inference within a mobile app, it would be desirable to keep the number of parameters as small as possible.\n\nLuckily, some very smart people came up with some very smart ideas to reduce the number of parameters while keeping a high model performance, such as in the [ShuffleNet](https://arxiv.org/abs/1707.01083), [SqueezeNet](https://arxiv.org/abs/1602.07360), or [MobileNetV2](https://arxiv.org/abs/1801.04381). All these models are also freely available online (in deep learning frameworks such as [PyTorch](https://pytorch.org/) or [TensorFlow](https://www.tensorflow.org/)), pretrained on ImageNet. I decided to go for MobileNetV2 as a starting point for my model. This architecture applies some cool tricks (which you can read about in the [paper](https://arxiv.org/abs/1801.04381)) such as inverted residuals and linear bottlenecks, but what really shrinks down the number of parameters is using depth-wise separable convolutions. Basically, these convolutions treat every channel of the image separately and then combine the results with point-wise convolutions. It's not trivial to explain how exactly that works, so I won't do it here, but [this blog post](https://towardsdatascience.com/a-basic-introduction-to-separable-convolutions-b99ec3102728) does a great job at it.\n\nFor this project I will use the PyTorch deep learning framework. I will now walk you through the process of building the model:\n\n```python\n# import the libraries we'll need\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nfrom torch.optim import lr_scheduler\nimport torchvision\nfrom torchvision import datasets, models, transforms\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom PIL import Image\nfrom IPython.display import clear_output\nimport time\nimport os\nimport math\nfrom copy import copy, deepcopy\n```\n\nNow let's define some config variables that we'll need throughout the rest of the code.\n\n```python\n# Use a GPU if it's available - if not, use CPU\ndevice = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n\n# path to the directory the image data resides in\ndata_dir = \"/content/images/small_images\"\n\n# path to where the model checkpoints are saved\ncheckpoint_dir = \"/content/gdrive/MyDrive/dog_classifier_checkpoint.p\"\n\n# fraction of the data that will be used for training (rest is for validation)\ntrain_frac = 0.8\n\n# batch size for training the model\nbatch_size = 64\n\n# for evaluating accuracy on only high-probability predictions\ncutoff_prob = 0.7\n```\n\nBefore working on the actual model, we should prepare the data set for both training and validation. Since we're working on image data, it is easy to use some data augmentation by applying some random transformations to the images. In this case, we'll randomly crop a 224x224 patch out of the (previously 256x256) images, maybe flip it horizontally, and apply some small rotation to it. This way, the model will never see the exact same image twice, which should help the model generalize better. Note that these transformations are applied to the training data only; for the validation data, we just crop out a center patch in the right size. For both parts of the data set, we'll also normalize the data on a per-channel basis. We'll subtract the mean value of each channel from each pixel and divide by the standard deviation (which I've calculated for this data set beforehand). 
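In case you want to estimate those statistics for your own data, a quick-and-dirty way could look like the sketch below; it reuses the `directory_names` list from the download code above, and note that averaging per-image means and standard deviations is only an approximation of the true data set statistics:\n\n```python\nimport os\nimport numpy as np\nfrom PIL import Image\n\nchannel_means, channel_stds = [], []\n\n# collect per-image, per-channel statistics over the resized images\nfor directory_name in directory_names:\n    directory_path = os.path.join('data', 'bing_image_api', 'small_images', directory_name)\n    for file_name in os.listdir(directory_path):\n        image = np.asarray(Image.open(os.path.join(directory_path, file_name)).convert('RGB')) / 255.0\n        channel_means.append(image.mean(axis=(0, 1)))\n        channel_stds.append(image.std(axis=(0, 1)))\n\n# average the per-image values to get approximate data set statistics\nprint('mean:', np.mean(channel_means, axis=0))\nprint('std:', np.mean(channel_stds, axis=0))\n```\n\n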
Centering the data around zero usually helps with convergence and hence, speeds up training.\n\n```python\n# here we got the transforms and normalization I talked about\ndata_transforms = {\n 'train': transforms.Compose([\n transforms.RandomCrop(224),\n transforms.RandomHorizontalFlip(),\n transforms.RandomRotation(5),\n transforms.ToTensor(),\n transforms.Normalize([0.512, 0.489, 0.422], [0.267, 0.263, 0.271])\n ]),\n 'val': transforms.Compose([\n transforms.CenterCrop(224),\n transforms.ToTensor(),\n transforms.Normalize([0.512, 0.489, 0.422], [0.267, 0.263, 0.271])\n ]),\n}\n\n# load the whole data set from the folders that contain\n# the images of each dog breed\nwhole_dataset = datasets.ImageFolder(data_dir)\nclass_names = whole_dataset.classes\n\n# calculate train and validation set sizes\ntrain_size = math.floor(len(whole_dataset) * train_frac)\nval_size = len(whole_dataset) - train_size\n\ndataset_sizes = {'train': train_size, 'val': val_size}\n\n# do a train-validation-split\ntrain_set, val_set = torch.utils.data.random_split(whole_dataset, (train_size, val_size))\ntrain_set.dataset = copy(whole_dataset)\n\n# apply the data transforms to each data set\ntrain_set.dataset.transform = data_transforms['train']\nval_set.dataset.transform = data_transforms['val']\n\n# create data loaders from the data sets\ntrain_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=4)\nval_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, num_workers=4)\n\ndataloaders = {'train': train_loader, 'val': val_loader}\n```\n\nTo ensure that what we coded above makes sense, let's have a look at a batch of images that'll be used for training now:\n\n```python\ndef imshow(inp):\n # transpose tensor to match matplotlib's image format\n inp = inp.numpy().transpose((1, 2, 0))\n\n # reverse the data normalization\n mean = np.array([0.485, 0.456, 0.406])\n std = np.array([0.229, 0.224, 0.225])\n inp = std * inp + mean\n inp = np.clip(inp, 0, 1)\n\n # plot the image\n fig, ax = plt.subplots(figsize=(15, 15))\n ax.imshow(inp, interpolation=\"nearest\")\n plt.pause(0.001)\n\n\n# get a batch of training data\ninputs, classes = next(iter(train_loader))\n\n# make an image grid from batch\nout = torchvision.utils.make_grid(inputs)\n\n# plot\nimshow(out)\n```\n\n![A training batch of dog images.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/dog-classifier/doggrid.png)\n\nThis looks like everything works well with the data preparation. Now that we have the data sets prepared, we can set up the model. This step is actually really simple as we'll just slightly modify the MobileNetV2. Let's first get the pretrained model, which is readily available in PyTorch.\n\n```python\nmodel = models.mobilenet_v2(pretrained=True)\nmodel\n```\n\n```\nMobileNetV2(\n (features): Sequential(\n (0): ConvBNReLU(\n (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n (2): ReLU6(inplace=True)\n )\n (1): InvertedResidual(\n (conv): Sequential(\n (0): ConvBNReLU(\n (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)\n (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n (2): ReLU6(inplace=True)\n )\n (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)\n (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n )\n )\n # [...] 
Here would be 17 more repeating layers of InvertedResiduals,\n # but I left them out for a better readability of this post\n )\n\n (classifier): Sequential(\n (0): Dropout(p=0.2, inplace=False)\n (1): Linear(in_features=1280, out_features=1000, bias=True)\n )\n)\n```\n\nNow if we have a look at all the layers in the model, we can see that the last part is a classifier consisting of a Linear/Dense layer with 1000 output features. This is because there are 1000 classes in the ImageNet data set on which this model has been pretrained. We do, however, only have 121 dog breeds to classify, so we have to adjust the classifier of this model to suit our problem. In addition, we'll add a LogSoftmax layer to transform the output of the model to log probabilities. Using log probabilities rather than regular probabilities between 0 and 1 does generally lead to easier training of the model as log probabilities penalize larger errors more, lead to less arithmetic operations, and have better numerical stability (because they avoid very small probabilities close to 0). We also move the model to the `device`, meaning the GPU, if one is available.\n\n```python\n# replace the classifier-part of the model that has been pretrained on ImageNet\n# by a new, untrained classifier which we will then train to classify dog breeds\nnum_clf_in_features = model.classifier[1].in_features\nmodel.classifier = nn.Sequential(\n nn.Dropout(p=0.2, inplace=False),\n nn.Linear(in_features=num_clf_in_features, out_features=len(class_names), bias=True),\n nn.LogSoftmax(dim=1)\n)\n\n# move model to device\nmodel = model.to(device)\n```\n\n### Training the model\n\nNow that we have the model ready, what do we need to train it? We need to define a loss function (in PyTorch usually called `criterion`), which is a differentiable function that states how far off our predictions are. The choice of loss function depends on what the model outputs: if the last layer of the model was a Softmax function that outputs probabilities (between 0 and 1), we'd probably be using a cross-entropy loss. But since we are using a LogSoftmax that outputs log probabilities, we'll use a negative log likelihood loss (`NLLLoss`) instead. An optimizer will, starting from this loss function, compute the gradients of all parameters and perform the gradient decent to reduce the loss in the next iteration. How far down the gradient the optimizer goes at each step is (initially) determined by the learning rate. If the learning rate is too small, the model will take very long to train - if it's too high, the optimizer may miss a loss minimum. Hence, it makes sense to start with a higher learning rate and reduce it when we realize that the loss is no longer decreasing. In PyTorch, the learning rate can be manipulated with a variety of learning rate schedulers; the one we use here, `ReduceLROnPlateau`, will reduce the learning rate if the validation loss doesn't decrease anymore. In terms of optimizers, there, as well, is a large palette of algorithms to choose from (you can look them up [here](https://pytorch.org/docs/stable/optim.html)). What worked well for me is starting with [Adam](https://arxiv.org/abs/1412.6980), an optimizer that adapts the learning rate throughout training. 
However, when training with Adam started showing diminishing returns, I switched to using [stochastic gradient decent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) (`SGD`) with momentum and was able to improve the model performance a little further.\n\n```python\n# the loss function\ncriterion = nn.NLLLoss()\n\n# the optimizer (I started with Adam and went on with SGD)\noptimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)\n# optimizer = optim.Adam(model.parameters(), lr=0.001)\n\n# the learning rate scheduler\nrrp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, threshold=0.001, verbose=True)\n```\n\nEverything is set up now to enter the training loop. Within this (fairly large) loop, both model training and validation is going to happen. Note that this is all squeezed into one block of code because this is written for a blog post; if this was a production environment, I'd highly recommend splitting it up into several functions, maybe even separate files. Anyways, let's define the training loop function:\n\n```python\ndef train_model(model, criterion, optimizer, scheduler, num_epochs=25, only_classifier=False):\n since = time.time()\n\n # storing current best model weights and corresponding accuracy\n best_model_wts = deepcopy(model.state_dict())\n best_acc = 0.0\n\n # track validation score over the epochs\n val_losses = []\n val_accs = []\n prev_epochs_printout = \"\"\n\n for epoch in range(num_epochs):\n # log progress\n print(f\"Epoch {epoch}/{num_epochs - 1}\")\n print(\"-\" * 10)\n\n # each epoch has a training and validation phase\n for phase in [\"train\", \"val\"]:\n if phase == \"train\":\n model.train() # set model to training mode\n else:\n model.eval() # set model to evaluate mode\n\n # keep track of metric during epoch\n running_loss = 0.0\n running_corrects = 0\n\n # iterate over the data\n for i, (inputs, labels) in enumerate(dataloaders[phase]):\n inputs = inputs.to(device)\n labels = labels.to(device)\n\n # zero the parameter gradients\n optimizer.zero_grad()\n\n # forward\n # track history only if in train\n with torch.set_grad_enabled(phase == 'train'):\n outputs = model(inputs)\n _, preds = torch.max(outputs, 1)\n loss = criterion(outputs, labels)\n\n # if only the classifier should be trained, disable\n # all gradients and only enable them for classifier\n if only_classifier:\n for param in model.parameters():\n param.requires_grad = False\n for param in model.classifier.parameters():\n param.requires_grad = True\n else:\n for param in model.parameters():\n param.requires_grad = True\n\n # backward + optimize only if in training phase\n if phase == \"train\":\n loss.backward()\n optimizer.step()\n\n # update metrics\n running_loss += loss.item() * inputs.size(0)\n running_corrects += torch.sum(preds == labels.data)\n\n # print out status\n epoch_loss = running_loss / ((i+1)*batch_size)\n epoch_acc = running_corrects.double() / ((i+1)*batch_size)\n\n clear_output(wait=True)\n print(f\"{prev_epochs_printout}\nEpoch {epoch}/{num_epochs - 1} Progress {(i+1)*batch_size}/{dataset_sizes[phase]} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}\")\n\n # calculate metric for the whole current epoch\n epoch_loss = running_loss / dataset_sizes[phase]\n epoch_acc = (running_corrects.double() / dataset_sizes[phase]).item()\n\n # if in validation phase, step the learning rate scheduler\n if (phase == \"val\") and (scheduler != None):\n scheduler.step(epoch_loss)\n\n # update validation metrics\n if phase == \"val\":\n 
val_losses.append(round(epoch_loss, 4))\n val_accs.append(round(epoch_acc, 4))\n prev_epochs_printout += f\"Epoch {epoch}/{num_epochs - 1} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}\\n\"\n\n # print out status\n print(f\"{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}\")\n\n # if the model performed better than at any previous stage,\n # set new best and save the model weights\n if (phase == \"val\") and (epoch_acc > best_acc):\n best_acc = epoch_acc\n best_model_wts = deepcopy(model.state_dict())\n torch.save(best_model_wts, checkpoint_dir)\n\n print()\n\n # print out final status\n time_elapsed = time.time() - since\n print(f\"Training complete in {(time_elapsed // 60):.0f}m {(time_elapsed % 60):.0f}s\")\n print(f\"Best val Acc: {best_acc:.4f}\")\n\n # restore best model weights\n model.load_state_dict(best_model_wts)\n\n return model\n```\n\nNow we can train the model in two steps. As mentioned above, the current model is already pretrained on the ImageNet data set, meaning all its convolutional layers have probably already learned features that may be relevant to our image data as well. However, we replaced the last section of the model, the classifier. I will attempt to train only the classifier part and leave the rest of the model as it is. By doing this, I hope to leverage the image-related features that the pretrained model has already learned and avoid \"sending a shock\" through the whole model, which may mess up the parameters of the convolutional layers. Once the model performance doesn't improve anymore while only training the classifier, I will make all parameters trainable and resume training the entire model.\n\n```python\n# train classifier only\nmodel = train_model(\n model, criterion, optimizer,\n scheduler=rrp_lr_scheduler,\n num_epochs=5,\n only_classifier=True\n)\n\n# train the whole model\nmodel = train_model(\n model, criterion, optimizer,\n scheduler=rrp_lr_scheduler,\n num_epochs=25,\n only_classifier=False\n)\n\n```\n\nThe (truncated) output looks something like this:\n\n```\nEpoch 0/9 Loss: 0.8426 Acc: 0.7649\nEpoch 1/9 Loss: 0.7726 Acc: 0.7850\nEpoch 2/9 Loss: 0.7064 Acc: 0.8017\nEpoch 3/9 Loss: 0.7089 Acc: 0.8013\nEpoch 4/9 Loss: 0.6813 Acc: 0.8113\nEpoch 5/9 Loss: 0.6557 Acc: 0.8207\nEpoch 6/9 Loss: 0.6662 Acc: 0.8211\nEpoch 7/9 Loss: 0.6543 Acc: 0.8234\nEpoch 8/9 Loss: 0.6406 Acc: 0.8275\n# [...] Cut out some lines here for readability\nEpoch 24/24 Progress 31552/31521 Loss: 0.4532 Acc: 0.8901\nval Loss: 0.4532 Acc: 0.8901\n\nTraining complete in 97m 55s\nBest val Acc: 0.8901\n```\n\nAfter many epochs and playing around with optimizers, learning rates, and data augmentation, the dog breed classifier seems to perform decently. We'll go into a more detailed evaluation in the next section.\n\n### Evaluating the model\n\nWhat I want to do here is to run the validation set (on which the model has not been trained) through prediction again and score not only its absolute accuracy, but also consider its performance when only high-probability predictions are taken into account. The motivation for this is as follows: if someone takes a really crappy picture of a dog, one can't really expect a model to classify it correctly. I didn't have the time to manually go through all the dog images I scraped to see if they look like proper pictures of that dog breed. I noticed that e.g. among the pug images, there were a lot of mixed breeds that presumably contained pug DNA, but didn't really look like a pug. 
Also pictures taken from weird perspectives or with many different dogs in them cannot be expected to be classified correctly. If this model is served in a mobile app, that wouldn't really be a problem; if the certainty of a prediction is below a certain percentage, say 70%, one could prompt the user to take a better picture of the dog. Therefore, I'll also check how the model performs if only predictions with a certainty over 70% (`cutoff_prob`) are considered.\n\n```python\n# deactivate gradient tracking in evaluation mode\nwith torch.no_grad():\n model.eval()\n\n # keep track of metrics\n running_corrects = 0\n running_high_prob_corrects = 0\n total_high_prob = 0\n\n all_preds = []\n all_labels = []\n\n # iterate over validation data\n for i, (inputs, labels) in enumerate(val_loader):\n inputs = inputs.to(device)\n labels = labels.to(device)\n\n # run inputs through model\n outputs = model(inputs)\n\n # make predictions, regular and for high certainty;\n # the model outputs log probabilities, so we exponentiate\n # them before comparing against the cutoff probability\n max_vals, preds = torch.max(torch.exp(outputs), 1)\n high_prob_mask = (max_vals >= cutoff_prob)\n high_prob_preds = preds[high_prob_mask]\n high_prob_labels = labels[high_prob_mask]\n\n all_preds += preds.tolist()\n all_labels += labels.tolist()\n\n # update metrics\n running_corrects += torch.sum(preds == labels.data)\n running_high_prob_corrects += torch.sum(high_prob_preds == high_prob_labels.data)\n total_high_prob += len(high_prob_preds)\n\n# print out result\nprint(f\"Acc: {running_corrects/dataset_sizes['val']:.4f}\\nHigh Prob Acc: {running_high_prob_corrects/total_high_prob:.4f}\")\n```\n\n```\nAcc: 0.8901\nHigh Prob Acc: 0.9535\n```\n\nWe can see that if we only consider predictions with a high certainty, the model performs really well at 95% accuracy. We have to keep in mind though that this doesn't mean that the model recognizes all dog breeds with such a high accuracy. There are significant differences between the metrics for individual dog breeds, as can be seen by looking at the classification report:\n\n```python\nfrom sklearn.metrics import classification_report\ncp = classification_report(\n all_labels,\n all_preds,\n labels=list(range(len(class_names))),\n target_names=class_names\n)\nprint(cp)\n```\n\n```\n precision recall f1-score support\n\n affenpinscher 0.95 0.97 0.96 204\n afghan_hound 0.96 0.95 0.95 234\n african_hunting_dog 0.98 0.96 0.97 225\n airedale 0.95 0.97 0.96 201\namerican_staffordshire_terrier 0.78 0.67 0.72 211\n\n # [...] I cut out most lines here for readability\n\n whippet 0.85 0.87 0.86 234\n wire_haired_fox_terrier 0.92 0.94 0.93 218\n yorkshire_terrier 0.90 0.79 0.84 240\n\n accuracy 0.89 31521\n macro avg 0.89 0.89 0.89 31521\n weighted avg 0.89 0.89 0.89 31521\n```\n\nAs we can see, the performance differs quite a lot among the different breeds. Some show an F1 score as high as 98%, others as low as 70%.\nWe can have a look at which dog breeds are often mixed up by looking at the confusion matrix.\n\n```python\nfrom sklearn.metrics import confusion_matrix\ncm = confusion_matrix(all_labels, all_preds)\n# set all the correctly classified entries to zero\n# because we're not interested in them\nnp.fill_diagonal(cm, 0)\ndf_cm = pd.DataFrame(cm, index=class_names, columns=class_names)\n```\n\nI'll spare you looking at the entire confusion matrix in detail here, but as an example we can pick out the breeds that have been mixed up particularly often. 
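One way to pull the most frequent mix-up out of that matrix programmatically could be the following little sketch (the diagonal is already zeroed out, so the largest remaining entry is the most common confusion between two different breeds):\n\n```python\n# locate the largest off-diagonal entry of the confusion matrix\ntrue_idx, pred_idx = np.unravel_index(np.argmax(cm), cm.shape)\nprint(class_names[true_idx], 'was most often misclassified as', class_names[pred_idx])\nprint('number of misclassifications:', cm[true_idx, pred_idx])\n```\n\n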
It's not very surprising that it's breeds that look very similar and are even closely related to each other.\nWithin the validation data, the breed that was most often misclassified (80 times) as one particular other breed, was the Pembroke Welsh Corgi, which has been mistaken for the Cardigan Welsh Corgi. As you can see in the picture below, these two really do look pretty similar:\n\n![The Cardigan Welsh Corgi and Pembroke Welsh Corgi in comparison.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/dog-classifier/corgicomparison.jpg)\n\nOverall, I got the impression that the model is performing pretty decently at classifying breeds and only fails often on images that do either not capture the dog properly or for dog breeds that are very similar to each other. Even I as a human would have some trouble telling a bunch of Corgis apart.\n\n### Conclusion\n\nIn conclusion, we've seen how to build a decently performing dog breed classifier by getting a lot of image data via an image search and using it to fine tune a pretrained convolutional neural network. Via a free Azure account and the Bing image search API, it was fairly easy to get more than a hundred thousand photos of he 121 different dog breeds. Since most of the dog breeds were already a part of the ImageNet data set, it was ideal to start from a model that has been pretrained on ImageNet. The chosen model architecture was MobileNetV2, which is designed for performing well while using relatively few parameters so that it is feasible to run inference on mobile devices. The final model reaches an overall validation accuracy of about 89% (up to 95% when only predictions with more than 70% certainty are considered), but the performance varies quite a bit among the different breeds. I'll probably try to incorporate this model in a mobile app soon, so look out for an upcoming post on that. Thanks a lot for reading!\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Keep your head up","Just a personal story of navigating tough times",new Date("2021-01-20"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/keep-your-head-up/brain.jpg","Meningoencephalitis isn't fun",["Non-Tech"],"I haven't been writing much lately, nor have I been very productive in general over the last two months. That is because I've had some pretty challenging weeks due to being seriously sick. This is absolutely nothing technical, just a personal story of mine, and I'm writing this mostly for myself to reflect on the past few weeks. For those who are going or have gone through a similar experience, it may be an interesting read though, I don't know.\n\n### Suspiciously OK\n\nThe year 2020 has been a huge challenge for the whole world. Due to the Covid-19 pandemic, many people lost their health, life, or loved ones. The economy of most nations took a strong hit and many people lost their jobs or had to watch their businesses go bankrupt. Lock-downs forced people to pause most of their beloved social activities and resulted in isolation and loneliness. Until the last quarter of 2020, that wasn't my personal year though. Just a month before the virus hit Germany, I had started a great new job, moved back to my hometown after more than 20 years, and met awesome new people. As a Data Scientist, I had no problem working from home (I got even more productive) and I was used to talking to many of my close friends only via video-call anyways. 
I felt that both my professional and personal lives were going well...maybe a bit too well, considering that the world around me was in crisis-mode.\n\n### Getting sick\n\nIn the end, it wasn't Covid that got me, but most likely a tick bite. The exact pathogen couldn't be found, but symptoms and presence of corresponding anti-bodies point towards a tick-bourne encephalitis; a brain inflammation caused by a virus transmitted via ticks, native to where I went on vacation just a few weeks earlier. One usual morning, I just finished my daily meditation and went back to bed to read a bit, I suddenly felt a headache. At first, I didn't think much about it but the pain intensified very quickly. Half an hour later I was in strong pain, one of my arms feeling numb, and my sense of balance wasn't working properly anymore. I tried to get through the bathroom door but ran straight into the door frame. My girlfriend, being worried about me asked what was going on, but I couldn't answer properly anymore. The words just wouldn't come out. I tried to send an email or text to my boss to tell him that I'm wasn't coming to work today, but I couldn't type or write anymore either.\n\nThis was the start of one of the most horrible days I've ever experienced. I also experienced that the German healthcare system, which loves to praise itself for how incredibly amazing it allegedly is, really doesn't give a shit about you if you're a patient with _\"only\"_ compulsory governmental health insurance. Despite my symptoms matching a stroke, they wouldn't send an ambulance to get me so my girlfriend and flatmate had to carry me into a public bus and to the nearest hospital. I don't remember all the details of that day anymore; it felt like a long stream of pain and fear. The headaches were getting worse by the minute and I couldn't talk, couldn't communicate with any of the doctors or nurses that were pushing me through all sorts of machines and sticking pointy things into my body. I clearly remember a moment in which I was left alone on an uncomfortable stretcher in an ice cold room with lots of needles and cables sticking out of me, my head almost exploding, and I was crying because I had no idea where this was going and I was afraid. Anxiously wondering if I'll survive this day at all, if I'll ever see my loved ones again or could talk to them with the voice I had lost, if I will be mentally disabled for the rest of my life. And the only thing I could do about it was to scream at the top of my lungs.\n\nIt went on like this until in the evening someone rammed a huge needle into my spine to take some cerebrospinal fluid for diagnostic testing and then later told me that I had a brain inflammation. Great, what does that mean? Apparently it meant that I had to stay at the hospital for a few weeks for a antibiosis therapy until they had found and eliminated the pathogen that was causing the inflammation. My ability to speak and to move my body normally were returning quite quickly after the first attack, but the severe headaches stayed. Unfortunately, they also didn't find the type of bacteria or virus that was causing the inflammation. I cannot judge if that is common, but it didn't feel like the doctors were putting in a lot of effort (or even interest). No one there was really caring much about their patients; delivering meals and medicine three times a day and that was it. 
Due to the Covid pandemic, no visitors were allowed either, so it were some pretty lonely weeks under constant pain.\n\n### Wonderful people\n\nI don't know in which mental condition I would be now, if it wasn't for a lot of great people around me. Despite not being allowed to enter the hospital, my girlfriend and flatmates dropped of everything I needed (first and foremost my laptop :D) in front of its doors and later on met me outside with enough distance in between. My friends, many of them from the other end of the world, checked in on me and called me to tell me that everything is going to be alright. My grandma called every day to tell me that I have to keep fighting the illness. Even my manager and some coworkers called frequently to ask how I was doing and wish me to get well soon. I was worried about how they would perceive me being absent from work for almost four weeks, but everyone assured me that it was fine and I should focus on healing and not mind work at all. After leaving the hospital, still with headaches and not knowing which particular pathogen caused the inflammation, it wasn't easy to make the first small steps back into normal life. But again, everyone at home and at work was very understanding about me not yet being back to 100% performance or having to spontaneously go to emergency room again. Slowly but steadily things were getting better, I started exercising and meditating again, the headaches were getting less severe, and I gained back my strength.\n\n### Acceptance\n\nIt was a terrible time for me but it could have been much worse if I hadn't had my loved ones and colleagues cheering me up. I also had to learn to cheer myself up, accept my situation, and just make myself believe (reasonable or not) that things were going to get better eventually. Of course, my head was full of dark questions that I had to keep answering to myself. Why did it hit me? What did I do? Nothing, randomness, and accident, it just happened and now I have to accept it. Will I ever be like I was before? Will I ever fully recover from the effects of this brain disease? I don't know, I hope so. At least I was sure that worrying about it isn't going to accelerate my recovery, it would just make me feel worse. I didn't have any active influence over the course of this disease; the only thing I could do is to bare it, accept my fate, and hope for better future days to come. And better days came. And I was dragged there by the people who care for me and my own stubborn, trying-to-be-equanimous mind. Now, two months after the whole thing stated, I hopefully got the very last spinal tap of my life and the result doesn't show a significant inflammation of my brain anymore. Almost healthy again, just a little bit of headaches left, finally.\n\n### What I've learned\n\nI guess I just felt like writing this down for myself. But also to tell anyone who's going through though times (and that's probably a lot of people with the pandemic still raging): Keep your head up! You mustn't lose hope. Bad things may happen just like that and, often, there's nothing we can do to change that. The Covid pandemic is the best example for that. But we can try to make the best out of it and hope for better times. Keep the people that care for you and that you care for as close as social distancing allows it. Try to be not alone. And try to accept your fate; instead of ruminating how terrible things are all the time, try to think about what you can do to make it better and then stop caring about the rest. 
Look forward and believe that, eventually, those better days will come. This may all sound a bit cheesy, but it is what helped me getting through almost two months of pain and hardship without becoming depressed or insane. Thanks to all the folks who helped me getting better! And thanks for reading, stay safe, healthy, happy :)\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Machine Learning, NLP, and Toxic Language Classification","An introductory talk I gave at the CODEx2020 Developer & IT-Expert Conference",new Date("2020-11-18"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/codex2020/codex_thumbnail.png","Come learn about machine/deep learning and natural language classification!",["Data Science & AI/ML","Learning"],'### Check out the talk\nAt the CODEx2020 Developer & IT-Expert Conference I gave a talk about Machine Learning, Natural Language Processing (NLP), and Toxic Language Classification with [ToxBlock](http://www.pascal-bliem.com/tox-block), a Python deep learning application I built last spring.\n\nI give an introduction to machine/deep learning, go through the basics of natural language processing, talk about [ToxBlock](http://www.pascal-bliem.com/tox-block), my toxic language classification package, and finally give a demo. If you\'re not a machine learning or NLP practitioner already, this recording may be a decent explanation of the fundamentals to you.\n\nYou can find a ToxBlock project page with links to Github repositories, PyPI, and blog posts [here](http://www.pascal-bliem.com/tox-block). And here\'s the recording of the talk:\n\n
\n\n
\nFeel free to contact me if you have any question, remarks, or ideas on the topic. \n\nThe icons in the presentation were kindly provided free of cost by Eucalyp, Freepik, and Nhor Phai at [FlatIcon](https://www.flaticon.com/).\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("The Diarysta Frontend","Building a responsive single-page app with React and Materialize",new Date("2020-10-26"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/diarysta-frontend/diary.jpg","How did you feel? What did you do? And how does it correlate? Keep track with Diarysta!",["Web Development"],'**TL;DR**: I built a diary app called Diarysta in which you can track your daily moods and activities and get a graphical summary of your personal diary-stats. It is a MERN stack (MongoDB, Express, React, Node.js) application and in this blog post, I will discuss the design of its frontend/user interface for the web browser. I have previously written [another post on its backend](http://www.pascal-bliem.com/blog/the%20diarysta%20backend). You can visit a demo of the app [here](https://diarysta.onrender.com/) and also have a look at its [Github repo](https://github.com/Pascal-Bliem/diarysta).\n\n### Keep track of your mood\n\nDo you know how your mood fluctuates over time or how it correlates with your every-day activities? Traditionally, that\'s what people have diaries for. But hey, wouldn\'t it be cooler to use an app that let\'s you track those things in a way that is neatly organized, updatable, searchable, and gives you a graphic summary of how you\'ve felt and what you\'ve done? Yes, it would! So that\'s what Diarysta is all about. A web app that let\'s you create diary entries in which you specify your mood and select the activities you\'ve done, all in your browser. And it comes in English, German, and Indonesian! You can find Diarysta\'s source code on [Github](https://github.com/Pascal-Bliem/diarysta). You can find a demo of the actual project [here](https://diarysta.onrender.com/). Note that the hosting instance hibernates after long inactivity, and it may take a few seconds for it to wake up and make the app available.\n\nThe app is build with the MERN stack, which stands for [**M**ongoDB](https://www.mongodb.com/), [**E**xpress](https://expressjs.com/), [**R**eact](https://reactjs.org/), and [**N**ode.js](https://nodejs.org/). MongoDB is a NoSQL document-based database, Node.js is a JavaScript runtime environment that executes JavaScript code outside a web browser, and Express is minimalist web framework that runs on Node. I\'ve already discussed those backend components in a [previous post](http://www.pascal-bliem.com/blog/the%20diarysta%20backend). In this post I will talk about the frontend which has been built with React, a JavaScript library for building user interfaces or UI components, and [Materialize](https://materializecss.com/), a responsive front-end framework based on [Material Design](https://material.io/). I will talk about implementation details on a high level, but not go down into the source code. Describing the whole code base would be way too verbose, so I\'ll try to provide images of the UI components, explain what\'s going on with them, and link to the corresponding file in the Diarysta [Github repo](https://github.com/Pascal-Bliem/diarysta).\n\n### Thinking about functionality and state\n\nWhat does a diary app need to do and how do we let all components of the application know whats going on? 
React apps are generally composed of [components](https://reactjs.org/docs/components-and-props.html), which are independent, reusable, and conceptually isolated pieces of the UI. Individual components can hold [state](https://reactjs.org/docs/state-and-lifecycle.html) to keep track of whats going on inside of them, and they can also pass down this information as properties (or just props) to their child components. However, if there are many components at different levels of the UI hierarchy that need to access the same state information, this can get messy pretty quickly. It would be much better if there was a single source of truth that "hovers" over all components. For very large applications, a popular library for state management is [Redux](https://redux.js.org/), but React also has its own [Context-API](https://reactjs.org/docs/context.html), which I use in this project. It provides a way to pass data through the component tree without having to pass props down manually at every level by making the state of interest available through the context everywhere.\n\nWe want to be able to create/login users and check if they\'re authenticated to make private routes accessible. So, first of all we need a [context for authentication](https://github.com/Pascal-Bliem/diarysta/tree/master/client/src/context/auth) (I will now always link to the components/files I\'m talking about), which will keep track of the user, its token, and its state of authentication. It will also provide all necessary functions calling the API endpoints for registration, login, logout, etc. Then, of course, we want to read all of our diary entries, filter them for search terms, create new ones, and maybe update or delete old ones. We\'ll need an [entry context](https://github.com/Pascal-Bliem/diarysta/tree/master/client/src/context/entry) that keeps track of all this and provides the functions that\'ll do the corresponding request to our backend API. Last but not least, we want the whole UI to be available in three languages: English, German, and Indonesian (which are the only languages I\'m more or less fluent in). We need a [locale context](https://github.com/Pascal-Bliem/diarysta/tree/master/client/src/context/locale) for keeping the users\' locale, saving it to users\' local storage so that it\'ll be remembered and of course, keeping all the [translations](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/config/translations.js). Seriously, when I started this project, I had no clue how incredibly annoying it would be to create a trilingual application, having to type every little UI text three times. Anyway, let\'s [login](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/auth/Login.js) or [register](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/auth/Register.js), select the [language](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/layout/LanguageMenu.js) you need, and take a tour through the app!\n\nSelect your language and login or register!\n\n### Filling Diarysta with entries\n\nOnce you\'re logged in, you\'ll find yourself on the main [entries](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/pages/Entries.js) page. 
You can get to other pages through the [navbar](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/layout/Navbar.js) without even having to send new request to the server, since Diarysta is a single-page application and all routing is done on the front end by the [React Router](https://reactrouter.com/) library. If you have already created any entries, you\'ll find them here, sorted by date of creation. All of a user\'s entries are fetched via an API call to the Diarysta backend, which gets them from the database. Besides a date, every [entry](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/entries/EntryItem.js) also has a [mood](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/entries/Mood.js), represented by a matching emoji, which corresponds to how you felt when creating the entry. Optionally, you can select [activities](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/entries/Activities.js) you did that day and add a note. You\'ll be able to search all posts for keywords as well by using the [search bar](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/entries/EntryFilter.js) on top of the page.\n\nRead and search your diary entries.\n\nIn case you haven\'t created any entries yet, you can do so by clicking the action button on the bottom right. By default, this button will preselect the current date for the entry, but if you hover over it, it\'ll show you two more buttons for creating an entry for yesterday or any date by redirecting to the calendar page. This is just for convenience, as you can also change the date (as long as it\'s not in the future) in the [form](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/entries/AddEntry.js) that pops up after you clicked. You\'ll also have to select a mood (it\'ll complain if you don\'t) and you can select various activities from categories such as health, hobbies & studying, social, food, and chores. Those activities will show up as tags on your entry. Last but not least you can write down what you did that day in a note. You can open the same form with pre-filled fields if you want to [edit](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/entries/UpdateEntry.js) an entry, and you can, of course, also delete it.\n\nAdd a new entry and select a mood, activities, and a note.\n\nIf you want to get an overview of what happened when, you can navigate to the [calendar page](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/pages/Calendar.js) through the [navbar](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/layout/Navbar.js). You can pick a date, as long as it\'s not in the future, and see which entries you created or chose to create a new one for that date.\n\nLook up, create, or delete entries for certain days.\n\n### Your personal statistics\n\nIf you want to get a cool graphical overview of how your mood developed, what you have done how often, and how your activities correlate with your mood, take a look at the [stats page](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/pages/Stats.js). On the top, you\'ll see [two counters](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/stats/DaysInRow.js) that\'ll tell for how many days in a row, from today, you\'ve been adding entries and how long your longest chain of entries was, respectively. 
Below you\'ll find a couple of charts (using the [Chart.js](https://www.chartjs.org/) library) which were constructed from your diary entries. For all charts, the radio buttons below let you chose if you want to consider a range of the last 7, 30, or 365 days when calculating the stats. The [first chart](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/stats/MoodChart.js) shows you how your mood developed over the selected time range.\n\nSee how many days in row you added entries and how your mood developed over time.\n\nGoing further down, you got a [bar chart](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/stats/ActivityCount.js) that counts how often you did which activity, and below that, another [bar chart](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/stats/ActivityMoodCorrelation.js) which displays the average mood you were in when performing a certain activity.\n\nSee which activities you did how often and what your average mood was when doing these activities.\n\nThere\'s one more bar chart that shows you your [average mood per day of the week](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/stats/AverageDailyMood.js) (how bad are your Mondays?) and, finally, a [doughnut chart](https://github.com/Pascal-Bliem/diarysta/blob/master/client/src/components/stats/MoodCount.js) that simply display which fraction each mood option takes in your overall state of mind.\n\nSee your average mood per day of the week and your overall mood count.\n\n### Conclusion\n\nAnd that\'s basically it! A MERN stack app that let\'s you keep track of your mood and activities, and gives you a nice graphical overview of them. In a [previous post](http://www.pascal-bliem.com/blog/the%20diarysta%20backend) I have already discussed how to create a backend API with Express and Node.js and connect it to a MongoDB database. In this post, we\'ve seen an example of a single-page React app which connects to this API to only send and receive data and does everything else by itself on the frontend. I hope you found it interesting to read and maybe it inspires you to start keeping a diary yourself. If you\'re interested to look at it in detail, check out the [demo project](https://diarysta.onrender.com/stats) or Diarysta\'s [Github repo](https://github.com/Pascal-Bliem/diarysta). Thanks a lot for reading!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("The Diarysta Backend","Building the backend and REST-API for the Diarysta diary app with Node.js, Express, and MongoDB",new Date("2020-09-24"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/diarysta-backend/Blog-Article-MERN-Stack.jpg","Let's build a MERN stack app!",["Web Development"],'**TL;DR**: I built a diary app called Diarysta in which you can track your daily moods and activities and get a graphical summary of your personal diary-stats. It is a MERN stack (MongoDB, Express, React, Node.js) application and in this blog post, I will discuss the design and implementation of the back end. There will be another post on the frontend/user interface soon. You can visit a demo of the app [here](https://diarysta.onrender.com/) and also have a look at its [Github repo](https://github.com/Pascal-Bliem/diarysta).\n\n### Keep track of your mood\n\nDo you know how your mood fluctuates over time or how it correlates with your every-day activities? Traditionally, that\'s what people have diaries for. 
But hey, wouldn\'t it be cooler to use an app that lets you track those things in a way that is neatly organized, updatable, searchable, and gives you a graphic summary of how you\'ve felt and what you\'ve done? Yes, it would! So that\'s what Diarysta is all about. A web app that lets you create diary entries in which you specify your mood and select the activities you\'ve done, all in your browser. You can find Diarysta\'s source code on [Github](https://github.com/Pascal-Bliem/diarysta). If you\'re not interested in how it\'s technically implemented, you may stop reading right here and just check out the actual project [here](https://diarysta.onrender.com/). Note that the hosting instance hibernates after long inactivity, and it may take a few seconds for it to wake up and make the app available.\n\nThe app is built with the MERN stack, which stands for [**M**ongoDB](https://www.mongodb.com/), [**E**xpress](https://expressjs.com/), [**R**eact](https://reactjs.org/), and [**N**ode.js](https://nodejs.org/). MongoDB is a NoSQL document-based database, Node.js is a JavaScript runtime environment that executes JavaScript code outside a web browser, and Express is a minimalist web framework that runs on Node. Those are the components I\'ll talk about in this post. In an upcoming post, I\'ll discuss the frontend of the app, which is written in React, a JavaScript library for building user interfaces or UI components. In the following, I will discuss the technical details of this project, how it\'s designed, and in some cases how it is actually implemented. Since the code base is quite large, I will not discuss every bit of it down to the source code, but I\'ll provide code snippets wherever I find them useful.\n\n### Defining the data models\n\nLet\'s think for a moment about what we actually want to do in the app: what will be the core functionality? We want users to be able to register or log in and compose, read, update, and delete diary entries. Clearly, we\'re dealing with two principal entities: the user and the (diary) entry. For these entities we have to create database models. MongoDB is a document-based database (in contrast to, e.g., a relational SQL database), so a data model looks a lot like a JavaScript object. This has the advantage of being well suited for unstructured data (such as text), having flexible and easily extendable schemas, and scaling well. It is, however, less suitable for complex queries and analytics. The documents or objects in MongoDB cannot be joined like tables in a SQL database, but we can refer to a document within another document with its unique ID; that way we can still relate users to their entries. For a user, we\'ll need her name, email (which will serve as a login), password, and date of creation. For an entry, we\'ll need a reference to the user that created it, date of creation, the user\'s mood, a selection of activities which the user carried out that day, as well as a field for free text notes. 
Using [Mongoose](https://mongoosejs.com/), a fantastic object modeling library for MongoDB, the schemas would look like this:\n\n```javascript\nconst mongoose = require("mongoose");\n\nconst UserSchema = mongoose.Schema({\n name: {\n type: String,\n required: true,\n }, email: {\n type: String,\n required: true,\n unique: true,\n }, password: {\n type: String,\n required: true,\n }, date: {\n type: Date,\n default: Date.now,\n },\n});\n\nconst EntrySchema = mongoose.Schema({\n user: {\n type: mongoose.Schema.Types.ObjectId,\n ref: "user",\n }, date: {\n type: Date,\n default: Date.now,\n }, mood: {\n type: Number,\n required: true,\n }, activities: {\n type: [String],\n default: [],\n }, note: {\n type: String,\n default: "",\n },\n});\n\nmodule.exports = {\n user: mongoose.model("user", UserSchema),\n entry: mongoose.model("entry", EntrySchema),\n};\n```\n\n### API endpoints & authentication\n\nAfter we\'ve defined the data models and connected the database to the Node.js server, we can think about how we\'ll be able to get data in and out of the database. We\'ll use [Express](https://expressjs.com/), a minimal and flexible Node.js web application framework, to build the API that our frontend will later contact. For every "bit" of functionality, everything that we want to do with the data, we\'ll create an API endpoint, which means that we\'ll have to create routes to these endpoints in Express. I won\'t go through all the source code of the routes as that would get way to verbose, but you can imagine that in each route, we process some data and put it into or out of the database. You can still check out the full coe on [Github](https://github.com/Pascal-Bliem/diarysta). So, what exactly is it that we need to do? First of all, we need to be able to create a user so we\'ll need a `POST api/users` route. Then we need to be able to login the user (lets call the route `POST api/auth`) and get the logged in user back (`GET api/auth`). For that last route, we need a way to authenticate the user. I chose to use [JSON web tokens](https://jwt.io/). When the user logs in successfully, the server sends back the token in the response:\n\n```javascript\nconst jwt = require("jsonwebtoken");\n\n// some code to authenticate the user ...\n// if authentication was successful:\n\njwt.sign(user.id, config.get("jwtsecret"), { expiresIn: 36000 }, (err, token) => {\n if (err) throw err;\n res.json({ token });\n});\n```\n\nThe token will be stored in local storage on the client and will be send (if present) in a custom `x-auth-token` header with each request. On the server-side, each request to a non-public route (such as `GET api/auth`), will go through authentication middleware that looks something like this:\n\n```javascript\nconst jwt = require("jsonwebtoken");\nconst config = require("config");\n\nmodule.exports = (req, res, next) => {\n // Get token from header\n const token = req.header("x-auth-token");\n\n // check if there\'s no token\n if (!token) {\n return res.status(401).json({ message: "No token, authorization failed." });\n }\n\n try {\n // verify the token\n const decoded = jwt.verify(token, config.get("jwtsecret"));\n req.user = decoded.user;\n next();\n } catch (error) {\n return res.status(401).json({ message: "Token is not valid." });\n }\n};\n```\n\nNow that we got the user-related routes and authentication taken care of, we can have a look at the entries. 
We\'ll need to be able to create an entry (`POST api/entries`), update it (`PUT api/entries/:id`), delete it (`DELETE api/entries/:id`), and get all of a user\'s entries (`GET api/entries`). The route parameter `:id` in the URLs is the entry\'s unique ID in the database; this way we can access an individual existing entry. Whenever we create, update, or receive entries, the body of the request or response will contain entry objects in JSON format:\n\n```javascript\n// An example for a diary entry object\n{\n "user_id": "5f91be79f323e992seh534ze8534640cb",\n "date": "2020-10-22T17:16:22.599+00:00",\n "mood": 4,\n "activities": ["sports", "languages", "date"],\n "note": "What a great day do be alive and cook some nice food!"\n}\n```\n\n### A few last tweaks\n\nRegarding the actual API, that\'s actually all we have to do. The frontend will now have all the endpoints it needs to work with data on users and their diary entries. There are still a few more things to add to make the backend production ready. First of all, I want to add a simple health check route to be able to check if the app is up and running as expected:\n\n```javascript\n// health endpoint\napp.get("/health", (req, res) => {\n res.status(200).send("ok");\n});\n```\n\nThis is going to be particularly useful as I\'m planning to host the app on [Render\'s](https://www.render.com/) free tier, on which instances go into hibernation after some time of inactivity. Last but not least, I need to serve the actual frontend/user-interface to the client. Traditionally, that would mean that I\'d have to do a whole lot of additional routing to serve the client different files while she\'s navigating through the pages of my app. However, I\'m using React to build a single-page application, which "interacts with the user by dynamically rewriting the current web page with new data from the web server, instead of the default method of the browser loading entire new pages" ([Wikipedia](https://en.wikipedia.org/wiki/Single-page_application)). That means all of the routing will actually happen within the frontend and the server will only have to serve this one single-page app:\n\n```javascript\n// assuming the app\'s production build is in client/build/index.html\napp.use(express.static("client/build"));\napp.get("*", (req, res) => {\n res.sendFile(path.resolve(__dirname, "client", "build", "index.html"));\n});\n```\n\nNow, that\'s really all. We\'ve set up the API with basic CRUD (create, read, update, and delete) functionality and the rest of the action is going to happen on the frontend. This should demonstrate how the use of single-page apps that handle routing on the frontend allow for the backend to be simple, [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer), and able to agnostically serve multiple different web or mobile frontends. I know, I\'ve skipped a lot of details here for the sake of an improved reading experience; please have a look at Diarysta\'s [Github repo](https://github.com/Pascal-Bliem/diarysta) if you want to have a more in-depth look at the code. In the next blog post I will discuss the Diarysta frontend in depth, so stay tuned, and thanks a lot for reading!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Starfleet assimilated by the ChatBorg","A sequence-to-sequence deep learning approach to generating Star Trek dialogues",new Date("2020-08-18"),"https://i.ytimg.com/vi/Hno3K8H7U5g/maxresdefault.jpg","We are the ChatBorg. Training your neural networks is futile. 
Your vanishing gradients will be assimilated.",["Data Science & AI/ML","Learning"],"**TL;DR:** In this project I tried to train a sequence-to-sequence deep learning model on scripts of all the Star Trek TV shows with the aim to let it generate artificial Star Trek dialogues. I used an encoder-decoder long short-term memory (LSTM) network architecture, which is commonly found in task such as machine translation, question answering, and simple chatbots. The idea is to pass a dialogue line into the encoder and let the decoder generate an appropriate response. Long story short, it didn't work very well and only generates very short sentences without much meaning. I think this may be due to the data set being heavily dominated by few-words-long dialogue lines. Furthermore, free dialogue generation, as opposed to tasks with a clear request-response targets (such as translation or question answering) requires much more of a language understanding. For this, my model (due to my computational resource limitations) is way too simple, and large pretrained, transformer-based models may be a more suitable choice. So if you were looking for cool results, you may stop reading here. However, I found this project to be very educational and a great introduction to seq2seq models, so I decided to turn it into a blog post anyway. If you're interested in it for the sake of learning, please keep reading.\n\n### Introduction\n\nBeing a scientist, I guess it's not much of a surprise that I'm a big nerd and I love all sorts of Scify stuff. Star Trek is definitely one of my favorite franchises (though I love Star Wars as well) and I was very happy when I came across a [data set on Kaggle](https://www.kaggle.com/gjbroughton/start-trek-scripts) that contained raw text scripts of all Star Trek series episodes, scraped from [this](http://www.chakoteya.net/StarTrek/index.html) website. I recently took an online course titled [Deep Learning: Advanced NLP and RNNs](https://www.udemy.com/course/deep-learning-advanced-nlp/) in which the lecturer gave an introduction to sequence-to-sequence (seq2seq) models and used an LSTM-based encoder-decoder model to do neural machine translation from English to Spanish. It worked reasonably well for sentence to sentence translation and I wondered if it may be able to generate dialogues based on the same kind of request-response pattern. Wouldn't it be cool if we could generate new Star Trek scripts? Well, as I already mentioned in the TL;DR, it didn't work very well, but I found it very educative and wanted to share it. Below, I will discuss the following sections:\n\n1. [Seq2seq Theory](#theory)\n2. [Preprocessing the Data](#data)\n3. [Building the Model for Training](#training)\n4. [Modifying the Model to Generate Text](#decoding)\n5. [Generating Dialogues](#generate)\n6. [Conclusion](#conclusion)\n\n\n\n### Seq2seq Theory\n\nWhen working with text, we usually consider a sentece (or a document) as a sequence of words, much like a time series of words. Usually, some kind of recurrent neural network (RNN) architecture is used to process sequences in deep learning (though transformer-based models have become the state of the art in natural language processing lately). These kind of networks carry a hidden or state, sort of a context, from processing one part of the sequence to the next, which allows them to account for dependencies throughout the sequence. 
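\n\nStripped of all details, this recurrence is really just a loop that keeps updating a state vector while walking over the sequence. Here is a tiny NumPy sketch of a vanilla RNN step, purely to illustrate the idea (the LSTM used later adds gates on top of this, and the weights here are just random toy values):\n\n```python\nimport numpy as np\n\ndef rnn_step(x_t, h_prev, W_x, W_h, b):\n    # combine the current input with the state carried over from the previous step\n    return np.tanh(x_t @ W_x + h_prev @ W_h + b)\n\n# toy example: a sequence of 5 word vectors of size 10, hidden state of size 8\nrng = np.random.default_rng(42)\nsequence = rng.normal(size=(5, 10))\nW_x, W_h, b = rng.normal(size=(10, 8)), rng.normal(size=(8, 8)), np.zeros(8)\n\nh = np.zeros(8)  # initial state\nfor x_t in sequence:\n    h = rnn_step(x_t, h, W_x, W_h, b)  # the state (context) flows through the whole sequence\n```\n\n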
This is conceptually shown in the figure below.\n\n![A RNN \"unrolled\" - passing a state or context from one part of the sequence to the next.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/startrek-chatbot/RNN-unrolled.png)\n\nThe RNNs we are using are called long short-term memory (LSTM) networks, which are a little more complicated than in the figure above, but essentially are based on the same principles: they carry states (called hidden and cell states for the LSTM) from one part of the sequence to the next to understand long-range dependencies. With these kind of networks, we can build models that have different input-output-relationships, some of which are depicted in the figure below. We may want to input maybe just one topic and get a matching paragraph generated (one-to-many) or read a paragraph and classify its topic (many-to-one). We may also want to train a model to find the most important signal among all sequence steps (many-to-many with same input and output lengths, potentially with global max pooling) or we want to input a sentence and get a different one as a response, such as in machine translation or question answering (many-to-many with variable input and output lengths).\n\n![Different relationships of input and output in sequence processing models.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/startrek-chatbot/reltionships.jpg)\n\nOur example of generating dialogue belongs to the latter set of problems. We want to put in a dialogue line and get a different one as an output, similar to a question answering scenario. For these type of problems, an encoder-decoder architecture is commonly applied. The idea behind it is the following: We have an encoder LSTM that takes in the input sequence and encodes it. Instead of caring about the regular output of this encoder we just take it's final hidden and cell state, which is basically the context of the input sequence, and use them as initial states for the decoder. The decoder is another LSTM layer which takes these states and its own predicted word from the previous sequence step as input. During training, we try to make it a bit easier for the decoder to learn the correct output sequences by applying a method called teacher forcing. In teacher forcing, we feed the target output sequence offset by one as input into the encoder. During prediction, however, the every next prediction step in the sequence gets the actual prediction (and hidden and cell states) from the previous step as an input. The concept of an encoder-decoder model with teacher forcing is illustrated in the figure below for the example application of machine translation.\n\n![An example of an encoder-decoder sequence-to-sequence model for machine translation.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/startrek-chatbot/seq2seq.svg)\n\nTo make the encoding of words into a numerical format a bit more effective than just using an integer, we'll use en embedding layer in front of both the encoder and decoder. The idea behind word embeddings is to encode the meaning of words as vectors in high dimensional space. Word vectors of words with a similar meaning (e.g. bee, wasp, bumblebee, hornet) should be close together in this space, vectors of words that have nothing to do with each other should be far apart, and vectors of words with opposite meanings like \"front\" or \"back\" should ideally pointing into the opposite direction. 
If you would subtract the \"male\" vector from the word \"king\" and add the \"female\" vector, you should end up somewhere close to the word \"queen\" (as sketched in the figure below). To train such embeddings well, one usually needs a lot of text; hence, it often makes sense to initialize the parameters of the embedding layer with pre-trained embeddings and then fine-tune them on the given task. I use the [GloVe](https://nlp.stanford.edu/projects/glove/) embedding here, which has been pre-trained on the Wikipedia and Gigaword 5 corpora.\n\n![Word embeddings encode the meaning of word into multi-dimensional vectors.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/startrek-chatbot/wordembedding.svg)\n\nFinally we put a dense layer with a softmax activation behind the decoder to find the most probable word from the vocabulary to chose. The model I'll use here is (partially to restrictions in computational resources or rather my unwillingness to spend a lot of money on them) is pretty simple. It is only one LSTM layer in both the encoder and decoder and dimensionalities in both the embedding layer and the LSTMs latent space were held fairly low. This may make it difficult for the model to actually learn some language understanding from the dialogue line. Anyway, we'll see - let's get started!\n\n```python\n# import libraries\n\n# computation & data\nimport numpy as np\nimport scipy\nimport sparse\nimport pandas as pd\nimport unicodedata\n# plotting\nimport matplotlib.pyplot as plt\nplt.rcParams['figure.dpi'] = 300\n#plt.rcParams['figure.figsize'] = [5.0, 7.0]\nimport seaborn as sns\nsns.set_style(\"darkgrid\")\n# deep learning\nimport tensorflow as tf\nimport tensorflow.keras.backend as K\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau\nfrom tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding\nfrom tensorflow.keras.preprocessing.text import Tokenizer\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\n# standard python\nimport os, sys\nimport re\nfrom typing import List\n```\n\n\n\n### Preprocessing the Data\n\nHere we'll load in the data, have a look at it and do some preprocessing before we feed it into the model.\n\n```python\n# read in the data\ndata = pd.read_json(\"all_scripts_raw.json\")\n```\n\n```python\n# a look into the raw text of an episode of DS9\ndata[\"DS9\"].loc[\"episode 101\"][:1000]\n```\n\n \"\\n\\n\\n\\n\\nThe Deep Space Nine Transcripts - Trials and\\nTribble-ations\\n\\n\\n\\nTrials\\nand Tribble-ations\\nStardate:\\nUnknown\\nOriginal Airdate: 4 Nov, 1996\\n\\n\\n\\n\\n\\n\\n [Ops]\\n\\n(A pair of dour pin-striped bureaucrats arrive on\\nthe turbolift.) \\nKIRA: Welcome to Deep Space Nine. I'm Major Kira. \\nDULMUR: I'm Dulmur. \\n(An anagram of Muldur, and yes - ) \\nLUCSLY: Lucsly. Department of Temporal Investigations. \\nKIRA: We've been expecting you. \\nDAX: I guess you boys from Temporal Investigations are always on time. \\nDULMUR: Where's Captain Sisko?\\n\\n [Captain's office]\\n\\nSISKO: Are you sure you don't want anything? \\nDULMUR: Just the truth, Captain. \\nSISKO: You'll get it. Where do you want to start? \\nDULMUR: The beginning. \\nLUCSLY: If there is such a thing. \\nDULMUR: Captain, why did you take the Defiant back in time? \\nSISKO: It was an accident. \\nLUCSLY: So you're not contending it was a predestination paradox? \\nDULMUR: A time loop. That you were meant to go back into the past? 
\\nSISKO: Erm, no. \\nDULMUR: Good. \\nLUCSLY:\"\n\nWe can see that every speakers\u2019 line starts with a new line and their name in all caps followed by a colon, e.g. \\nSISKO: \nSo, we can use this pattern to split the individual lines:\n\n```python\n # use a regex to split the raw script into lines of individual speakers\n # the ' character in the regex is for O'BRIEN, the [OC] means over comm, I think\n def split_lines(raw_script: str) -> List[str]:\n return re.split(r\"(?:\\n[A-Z']+: )|(?:\\n[A-Z']+ \\[OC\\]: )\", raw_script)\n\n lines = split_lines(data[\"DS9\"].loc[\"episode 101\"])\n```\n\n```python\n# let's have a look\nlines[1:25]\n```\n\n [\"Welcome to Deep Space Nine. I'm Major Kira. \",\n \"I'm Dulmur. \\n(An anagram of Muldur, and yes - ) \",\n 'Lucsly. Department of Temporal Investigations. ',\n \"We've been expecting you. \",\n 'I guess you boys from Temporal Investigations are always on time. ',\n \"Where's Captain Sisko?\\n\\n [Captain's office]\\n\",\n \"Are you sure you don't want anything? \",\n 'Just the truth, Captain. ',\n \"You'll get it. Where do you want to start? \",\n 'The beginning. ',\n 'If there is such a thing. ',\n 'Captain, why did you take the Defiant back in time? ',\n 'It was an accident. ',\n \"So you're not contending it was a predestination paradox? \",\n 'A time loop. That you were meant to go back into the past? ',\n 'Erm, no. ',\n 'Good. ',\n 'We hate those. So, what happened? ',\n 'This may take some time. ',\n 'Is that a joke? ',\n 'No. ',\n 'Good. ',\n \"We hate those too. All right, Captain. Whenever you're ready. \",\n 'Two weeks ago the Cardassian Government contacted me and wanted\\nto return an Orb to the Bajorans. ']\n\nThere is still some cleaning to do here. We want to unicode-normalize everything to remove weird accents on alien names and such. There are still a couple of newlines \\n within the text and locations \\[Bridge\\] and scene descriptions (The turbo lift is full of bananas) are included in parenthesis. We'll also put whitespace between the last word of a sentence and the punctuation, remove trailing white spaces, and lowercase everything.\n\n```python\ndef preprocess_lines(lines: List[str]) -> List[str]:\n clean_lines = []\n for line in lines:\n # nomralize\n line = (unicodedata.normalize(u'NFKD', line).encode('ascii', 'ignore').decode('utf8'))\n # remove stuff in parenthesis\n line = re.sub(r\"\\(.*\\)\", \"\", line)\n line = re.sub(r\"\\[.*\\]\", \"\", line)\n # replace \\n and weird chars with space\n line = re.sub(r\"\\n\", \" \", line)\n line = re.sub(r\"[^a-zA-Z?.!,\xbf]+\", \" \", line)\n # put space before punctuation\n line = re.sub(r\"([?.!,\xbf])\", r\" \\1 \", line)\n line = re.sub(r'[\" \"]+', \" \", line)\n # strip and lowercase\n line = line.strip().lower()\n clean_lines.append(line)\n return clean_lines\n```\n\n```python\n# let's have a look at the cleaned lines\nclean_lines = preprocess_lines(lines)\nclean_lines[1:25]\n```\n\n ['welcome to deep space nine . i m major kira .',\n 'i m dulmur .',\n 'lucsly . department of temporal investigations .',\n 'we ve been expecting you .',\n 'i guess you boys from temporal investigations are always on time .',\n 'where s captain sisko ?',\n 'are you sure you don t want anything ?',\n 'just the truth , captain .',\n 'you ll get it . 
where do you want to start ?',\n 'the beginning .',\n 'if there is such a thing .',\n 'captain , why did you take the defiant back in time ?',\n 'it was an accident .',\n 'so you re not contending it was a predestination paradox ?',\n 'a time loop . that you were meant to go back into the past ?',\n 'erm , no .',\n 'good .',\n 'we hate those . so , what happened ?',\n 'this may take some time .',\n 'is that a joke ?',\n 'no .',\n 'good .',\n 'we hate those too . all right , captain . whenever you re ready .',\n 'two weeks ago the cardassian government contacted me and wanted to return an orb to the bajorans .']\n\nCool, that seems to have worked, so let's apply it to all the data:\n\n```python\n# we'll store all episodes' processed scripts in here\nall_episodes = []\n\nfor col in data.columns:\n for raw_script in data[col][data[col].notna()].values:\n all_episodes.append(preprocess_lines(split_lines(raw_script)))\n```\n\nLet's explore our corpus a bit. I'd like to know how many lines we have in total and how long they usually are.\n\n```python\nnum_lines = 0\nline_lengths = []\n\nfor clean_script in all_episodes:\n num_lines += len(clean_script)\n for line in clean_script:\n line_lengths.append(len(line.split()))\nprint(f\"Number of lines: {num_lines}\")\n```\n\n Number of lines: 250708\n\n```python\n# let's see how the line lengths are distributed (we wont )\nline_lengths = np.array(line_lengths)\nsns.distplot(line_lengths[line_lengths<1000], kde=False)\nprint(f\"Number of words at the 0.99 quantile: {np.quantile(line_lengths, 0.99)}\")\n```\n\n Number of words at the 0.99 quantile: 80.0\n\n![The distribution of dialogue line lengths in words.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/startrek-chatbot/seqlenhist.png)\n\nWe can see that we have about 250000 lines in total and that in 99% of the cases, the lines are not longer than 80 words. Therefore, I'll choose 80 as the maximum sequence length here and will later pad all shorter sequences with zeroes to this length. We can also see that half of the lines have only 11 words or less. Most of the dialogue lines seem to be very short, which means they'll need a lot of zero-padding later, which doesn't convey any information. This may make it difficult for the model to predict longer, more interesting sentences later.\n\nHere is probably also a good point to define the other parameters that will be used for the model. Besides the sequence length, I'll also define the maximum vocabulary size, dimensions for the word embedding and the latent LSTM encoding as well as training parameters such as batch size and number of epochs.\n\n```python\nBATCH_SIZE = 32 # batch size for training\nEPOCHS = 10 # number of training epochs\nLATENT_DIM = 128 # latent dimensionality of the LSTM encoding space.\nMAX_SEQ_LEN = 80 # the maximum length for the text sequences\nMAX_NUM_WORDS = 20000 # maximum vocabulary size for the word tokenizer\nEMBEDDING_DIM = 100 # dimensionality for the word embedding\n```\n\nSince we want to build a dialog system, we need to train it in a way that it'll output a sentence based on the previous sentence, and so on and so forth. That means we'll create pairs of input-target sequences form all lines and add start of sentence and end of sentence tokens to them. 
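\n\nFor two consecutive cleaned DS9 lines from above, one such pair would look roughly like this (the exact spelling of the start and end tokens is just an illustrative choice here):\n\n```python\n# one input-target pair built from two consecutive cleaned dialogue lines\ninput_line  = '<start> where s captain sisko ?'\ntarget_line = 'are you sure you don t want anything ? <end>'\n```\n\n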
We also want to train the decoder using teacher forcing, so we'll need the target sequences offset by one with a token as well.\n\n```python\n# we'll store the input, target, and target_inputs for\n# teacher forcing in these lists\ninput_lines = [] # input lines\ntarget_lines = [] # target lines\ntarget_lines_inputs = [] # target lines offset by 1\n\n# for each episode\nfor clean_script in all_episodes:\n # get all input and target lines (skip first one\n # because it is episode title line, and last one\n # because no response line will follow)\n for i, input_line in enumerate(clean_script[1:-1]):\n # if shorter equal than MAX_SEQ_LEN\n if (len(input_line.split()) <= MAX_SEQ_LEN and\n len(clean_script[i+1].split()) <= MAX_SEQ_LEN):\n # add start/end token and append to respective list\n input_lines.append(\" \" + input_line)\n target_lines.append(clean_script[i+1] + \" \")\n target_lines_inputs.append(\" \" + clean_script[i+1])\n```\n\nNow we have all the lines we need, but they're still strings, and a neural net cannot process them like this. We'll tokenize (split up by words and punctuation in this case) the text and turn it into numeric sequences.\n\n```python\n# fit tokenizer on vocabulary\ntokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')\ntokenizer.fit_on_texts(input_lines + target_lines)\n\n# the word index will allow is to translate from numbers\n# back to actual words\nword_index = tokenizer.word_index\n\n# tokenize all three seq types\ninput_sequences = tokenizer.texts_to_sequences(input_lines)\ntarget_sequences = tokenizer.texts_to_sequences(target_lines)\ntarget_sequences_inputs = tokenizer.texts_to_sequences(target_lines_inputs)\n\n# to feed the sequences into the encoder and decoder,\n# we'll have to pad them with zeroes (left and right, respectively)\nencoder_inputs = pad_sequences(input_sequences, maxlen=MAX_SEQ_LEN, padding=\"pre\")\ndecoder_inputs = pad_sequences(target_sequences_inputs, maxlen=MAX_SEQ_LEN, padding='post')\ndecoder_targets = pad_sequences(target_sequences, maxlen=MAX_SEQ_LEN, padding='post')\n```\n\nEventually, we want to have a model that can predict which of the MAX_NUM_WORDS is the most likely to come next. It's basically a many-class classification problem and we should probably use some form of categorical cross entropy as a loss function. Keras has a loss function called `SparseCategoricalCrossentropy` which allows to just encode the targets as integers, but as of now, it unfortunately doesn't work with sequence outputs. Therefore, I tried to use the regular `CategoricalCrossentropy`, which means that we'll have to one-hot encode the targets. I ended up using a custom version for the crossentropy to account for the zero-padding, but the one-hot encoding still applies. 
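\n\nA quick back-of-the-envelope calculation shows why this matters (assuming roughly 250000 sequence pairs, as counted above, and 32-bit floats):\n\n```python\nn_pairs, max_seq_len, vocab_size = 250_000, 80, 20_000\nbytes_per_float = 4  # float32\ndense_size_tb = n_pairs * max_seq_len * vocab_size * bytes_per_float / 1e12\nprint(f'dense one-hot targets would need about {dense_size_tb:.1f} TB')  # about 1.6 TB\n```\n\n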
If we'd encode this in a dense array it wouldn't fit into memory, so we'll have to put it into a sparse tensor.\n\n```python\n# assign the one-hot values\ncoords = [[],[],[]]\ndata = []\nfor i, d in enumerate(decoder_targets):\n dim1 = i # which sequence\n for j, word in enumerate(d):\n dim2 = j # which position in the sequence\n if word != 0:\n dim3 = word # which word in the vocabulary\n coords[0].append(dim1)\n coords[1].append(dim2)\n coords[2].append(dim3)\n data.append(1.0)\n\n# pass values to a sparse tensor of the right shape\n# len(decoder_targets) x MAX_SEQ_LEN x MAX_NUM_WORDS\ndecoder_targets_one_hot = sparse.COO(coords, data,\n shape=(len(decoder_targets),\n MAX_SEQ_LEN,\n MAX_NUM_WORDS))\n```\n\n\n\n### Building the Model for Training\n\nWe will try to improve our models understanding of the English language by using pretrained word embeddings, GloVe in this case. We'll load in the word vectors and set them as weights for the word embedding layers in front of the encoder and decoder.\n\n```python\n# load the GloVe word vectors from file\nglove_vectors = {}\nwith open(f\"./glove.6B.{EMBEDDING_DIM}d.txt\", encoding=\"utf8\") as f:\n # the format of the file is: word vec[0] vec[1] vec[2] ...\n for line in f:\n values = line.split()\n word = values[0]\n vec = np.asarray(values[1:], dtype='float32')\n glove_vectors[word] = vec\n\n# create an embedding matrix from the vectors\nembedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))\nfor word, i in word_index.items():\n if i < MAX_NUM_WORDS:\n embedding_vector = glove_vectors.get(word)\n if embedding_vector is not None:\n # words not contained in embedding will be initialized as zero\n embedding_matrix[i] = embedding_vector\n```\n\nWe'll use this matrix as starting weights for the embedding layer.\n\n```python\n# the embedding layer\nembedding_layer = Embedding(\n MAX_NUM_WORDS,\n EMBEDDING_DIM,\n weights=[embedding_matrix],\n input_length=MAX_SEQ_LEN,\n trainable=True # we'll fine tune it\n)\n```\n\nNow it's finally time to build the actual model. We'll start with the encoder. It'll just be an LSTM layer, but instead of making any use of the outputs, we'll just keep the hidden and cell state of the LSTM, which we will later feed into the decoder as its initial states.\n\n```python\n# the encoder\nencoder_inputs_placeholder = Input(shape=(MAX_SEQ_LEN,))\nencoder_inputs_x = embedding_layer(encoder_inputs_placeholder)\nencoder = LSTM(\n LATENT_DIM,\n return_state=True,\n # dropout=0.5\n)\nencoder_outputs, h, c = encoder(encoder_inputs_x)\n\n# keep the hidden and cell states as\n# initial states for the decoder\nencoder_states = [h, c]\n```\n\nThe decoder will also be an LSTM layer which will predict a sequence (the text we want to generate) and uses the hidden and cell states from the encoder as initial states for itself. The decoder layer will return its states as well, which do not play any role during training, but they\u2019ll be needed when we reuse this layer to make predictions later. 
We'll also add a final dense layer with a softmax activation to choose predictions from the vocabulary.\n\n```python\n# the decoder\ndecoder_inputs_placeholder = Input(shape=(MAX_SEQ_LEN,))\ndecoder_inputs_x = embedding_layer(decoder_inputs_placeholder)\n\n# Contrary to the encoder, we want to generate text\n# as output, hence, return_sequences=True.\ndecoder_lstm = LSTM(\n LATENT_DIM,\n return_sequences=True,\n return_state=True,\n # dropout=0.5\n)\n\n# use the encoder states as the inital states\ndecoder_outputs, _, _ = decoder_lstm(\n decoder_inputs_x,\n initial_state=encoder_states\n)\n\n# dense layer with softmax to make predictions\ndecoder_dense = Dense(MAX_NUM_WORDS, activation='softmax')\ndecoder_outputs = decoder_dense(decoder_outputs)\n```\n\nNow we can plug all components together to build the model, compile it, and train it. We will also implement custom definitions of the loss function as well as the accuracy, which account for the zero paddings on the sequences. I've tried the regular categorical crossentropy before, but it perormed much worse than the custom version here.\n\n```python\n# combine components to a model\nmodel = Model(\n inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder],\n outputs=decoder_outputs\n)\n\n# a custom loss that accounts for the zero-padding\ndef custom_loss(y_true, y_pred):\n mask = K.cast(y_true > 0, dtype='float32')\n out = mask * y_true * K.log(y_pred)\n return -K.sum(out) / K.sum(mask)\n\n# a custom accuracy that accounts for the zero-padding\ndef acc(y_true, y_pred):\n targ = K.argmax(y_true, axis=-1)\n pred = K.argmax(y_pred, axis=-1)\n correct = K.cast(K.equal(targ, pred), dtype='float32')\n\n # 0 is padding, don't include those\n mask = K.cast(K.greater(targ, 0), dtype='float32')\n n_correct = K.sum(mask * correct)\n n_total = K.sum(mask)\n return n_correct / n_total\n\n# compile the model\nmodel.compile(optimizer=Adam(learning_rate=0.005), loss=custom_loss, metrics=[acc])\nmodel.summary()\n```\n\n Model: \"model\"\n __________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to\n ==================================================================================================\n input_2 (InputLayer) [(None, 80)] 0\n __________________________________________________________________________________________________\n input_1 (InputLayer) [(None, 80)] 0\n __________________________________________________________________________________________________\n embedding (Embedding) (None, 80, 100) 2000000 input_1[0][0]\n input_2[0][0]\n __________________________________________________________________________________________________\n lstm (LSTM) [(None, 128), (None, 117248 embedding[0][0]\n __________________________________________________________________________________________________\n lstm_1 (LSTM) [(None, 80, 128), (N 117248 embedding[1][0]\n lstm[0][1]\n lstm[0][2]\n __________________________________________________________________________________________________\n dense (Dense) (None, 80, 20000) 2580000 lstm_1[0][0]\n ==================================================================================================\n Total params: 4,814,496\n Trainable params: 4,814,496\n Non-trainable params: 0\n __________________________________________________________________________________________________\n\nSince the targets are still in that huge sparse tensor, which apparently cannot directly be used for training, we'll supply the training batches 
via a data generator that densifies the targets in each batch. During training, Keras will call this generator function to get the training data and targets, batch by batch.\n\n```python\n# generator function to supply the training batches\ndef generate_data():\n counter=0\n while True:\n # select the batches\n encoder_batch = encoder_inputs[counter:counter+BATCH_SIZE,:]\n decoder_batch = decoder_inputs[counter:counter+BATCH_SIZE,:]\n target_batch = (decoder_targets_one_hot[counter:counter+BATCH_SIZE,:,:]\n .todense()\n .astype(np.float32))\n counter += BATCH_SIZE\n yield [encoder_batch, decoder_batch], target_batch\n\n # restart counter to yeild data in the next epoch as well\n if counter >= 1000:#len(decoder_targets):\n counter = 0\n```\n\n```python\nmodel_checkpoint_callback = ModelCheckpoint(\n filepath=\"./star_trek_chatbot_checkpoint.h5\",\n save_weights_only=False,\n monitor=\"loss\",\n mode=\"min\",\n save_best_only=True)\n\nreduceLR = ReduceLROnPlateau(\n monitor=\"loss\", factor=0.5, patience=2, verbose=0, mode=\"auto\",\n min_delta=0.0001, cooldown=0, min_lr=0\n)\n\n# train the model\nhistory = model.fit(\n generate_data(),\n steps_per_epoch=len(decoder_targets) // BATCH_SIZE,\n epochs=EPOCHS,\n callbacks=[model_checkpoint_callback, reduceLR],\n verbose=0\n)\n```\n\n```python\n# Save model\nmodel.save(\"./star_trek_chatbot.h5\")\n```\n\n\n\n### Modifying the Model to Generate Text\n\nNow that the model is trained, we want to make predictions. We still have to set up the generative functionality though. We need to take the components we already used and put them together to a new model in which the decoder will always predict one word at a time, taking the hidden and cell states as well as the previously predicted word as an input for the next prediction. This will be repeated until the end-of-sentence token appears.\n\nThe encoder will look just like before, but it will stand alone now and just be used for getting the initial states for the decoder.\n\n```python\n# the encoder consists of the same components as before\nencoder_model = Model(\n inputs=encoder_inputs_placeholder,\n outputs=encoder_states\n)\n```\n\nHere we have to set up the decoder components which will later be used in the prediction loop. We are using the previously trained decoder_lstm layer here and feed it the states and the outputs of the previous prediction in the sequence.\n\n```python\n# the hidden and cell states\ndecoder_state_input_h = Input(shape=(LATENT_DIM,))\ndecoder_state_input_c = Input(shape=(LATENT_DIM,))\ndecoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]\n\n# the regular input of length 1 (the previously predicted word)\ndecoder_inputs_single = Input(shape=(1,))\ndecoder_inputs_single_x = embedding_layer(decoder_inputs_single)\n\n# getting the outputs (a predicted word) and the states,\n# which will both be used as input for predicting the next word\ndecoder_outputs, h, c = decoder_lstm(\n decoder_inputs_single_x,\n initial_state=decoder_states_inputs\n)\n\ndecoder_states = [h, c]\ndecoder_outputs = decoder_dense(decoder_outputs)\n\n# the decoder sampling model\n# inputs: y(t-1), h(t-1), c(t-1)\n# outputs: y(t), h(t), c(t)\ndecoder_model = Model(\n inputs=[decoder_inputs_single] + decoder_states_inputs,\n outputs=[decoder_outputs] + decoder_states\n)\n```\n\nThe model will, of course, output numbers, not strings. 
Therefore, we will have to reverse the word index of the tokenizer to be able to translate from indicies back to actual words.\n\n```python\n# map indicies back to actual words\nindex_to_word = {value:key for key, value in word_index.items()}\n```\n\nThe last step is to set up the text sequence prediction loop. We'll pass the input text through the encoder to get the initial states for the decoder and always start by feeding a start-of-sentence token as input to the decoder. The loop will then generate predictions, one word at a time, until the end-of-sequence token is predicted.\n\n```python\ndef decode_sequence(input_seq):\n # encode the input to get the states\n states_value = encoder_model.predict(input_seq)\n\n # start with empty target sequence of length 1.\n target_seq = np.zeros((1, 1))\n # and fill with \n target_seq[0, 0] = word_index[\"\"]\n\n # if end is predicted, we break the loop\n end = word_index[\"\"]\n\n # the predicted output text\n output_sentence = []\n\n for _ in range(MAX_SEQ_LEN):\n output_tokens, h, c = decoder_model.predict(\n [target_seq] + states_value\n )\n\n # get next word index\n idx = np.argmax(output_tokens[0, 0, :])\n\n # if index corresponds to end, break\n if end == idx:\n break\n\n word = \"\"\n if idx > 0:\n word = index_to_word[idx]\n output_sentence.append(word)\n\n # update the decoder input with the word just predicted\n target_seq[0, 0] = idx\n\n # update states\n states_value = [h, c]\n\n return ' '.join(output_sentence)\n```\n\n\n\n### Generating Dialogues\n\nNow that everything is set up, let's try to produce some artificial neural Star Trek dialogues! I'll pick some examples that could be from Deep Space 9.\n\n```python\ntexts = [\" hello captain, how do you like the station ?\",\n \" have you seen jake ? \",\n \" the cardassians are heading towards the wormhole !\",\n \" after fifty years of occupation , bajor is finally indipendent .\"]\n\nfor text in texts:\n input_seq = pad_sequences(tokenizer.texts_to_sequences(text),maxlen=MAX_SEQ_LEN)\n print(text[8:])\n print(decode_sequence(input_seq) + \"\\n\")\n```\n\n hello captain, how do you like the station ?\n the ship .\n\n have you seen jake ?\n jennifer ?\n\n the cardassians are heading towards the wormhole !\n cancelled ?\n\n after fifty years of occupation , bajor is finally indipendent .\n dabo !\n\nWhile the results are generally rather disappointing (I've tried a bunch of other examples as well), I do love the last onw here; the Cardassians are gone, let's play Dabo! :D\n\n\n\n### Conclusion\n\nIn conclusion, I've tried to use a very simple seq2seq model based on an LSTM encoder-decoder architecture. It didn't work really well in terms of producing intersting new Star Trek dialougues that even Gene Roddenberry couldn't have dreamt of. Instead it spits out very short replies that do not neccessarily have much to do with the input. I think this may be due to the data set being heavily dominated by few-words-long dialogue lines. Furthermore, free dialougue generation, as opposed to tasks with a clear request-response targets (such as translation or question ansering) requires much more of a language understanding. For this, my model is way too simple. One the one hand, because I intentionally kept the number of components and parameters relatively small to keep it computationally tractable, on the other hand because the architecture itself may be a little to simple for free language generation. 
\n\nWith only the final cell state being passed from the encoder to the decoder, it means that the entire context from the encoding of the input has to be compressed into a single vector. Much of what was relevant along the sequence may have gotten lost at the final state, hence, not being available as useful input for the decoder. Using a seq2seq architecture that applies a [attention mechanism](https://arxiv.org/pdf/1902.02181.pdf) may perform better. Especially, very large [transformer](https://arxiv.org/abs/1706.03762)-based language models (which also use attention), such as Google's [BERT](https://arxiv.org/abs/1810.04805) or OpenAI's [GPT3](https://arxiv.org/abs/2005.14165) are excelling at language understanding and text generation. Maybe I'll try to pick this project up again, starting from such a giant, pretrained transformer model. Nonetheless, I found this project to be very educative and a great introduction to seq2seq RNN-based models. I've you made it until here, thanks a lot for reading, and may you boldly go where no human has gone before :)\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("The ToxBlock API: bring ML models into action by serving them on the web","Wrapping ToxBlock's functionality into a REST API, containerize it with Docker, and deploy it to the cloud",new Date("2020-07-12"),"https://www.cloudways.com/blog/wp-content/uploads/Rest-API-introduction.jpg","Bringing machine learning from research to production with containers and APIs.",["Data Science & AI/ML","Web Development"],'**TL;DR:** I built the open-source [ToxBlock API](https://github.com/Pascal-Bliem/tox-block-api), which serves [ToxBlock](https://github.com/Pascal-Bliem/tox-block), a Python deep learning package for recognizing toxic language in text, as a web service. If you\'re only interested in the usage of the API, feel free to jump to the [demo section](#usage). If you\'re interested in why it\'s a good idea to serve machine learning models via a REST API and how it can be done, just continue reading the rest of this post.\n\n*Disclaimer: Since this post discusses an application for classification of toxic language, it inevitably contains examples of toxic language that may be considered profane, vulgar, or offensive. If you do not wish to be exposed to toxic language, DO NOT proceed to read any further.*\n\nIn my [last post](http://www.pascal-bliem.com/blog/tox%20block%20using%20ai%20to%20keep%20discussions%20clean), I introduced ToxBlock, a Python machine learning application for recognizing toxic language in text. This time I want to talk about how the predictive functionality of this app can be put to use and integrated into other applications in the easiest way by serving it as a REST API, containerize the app with Docker, and deploy it to a cloud platform. Actually making use of machine learning models is something that seemed to have entered data science education only quite recently. By now, there are bunch of blog posts and tutorials and even some online courses on the deployment of machine learning models, but I remember that when I did a [data science specialization](https://www.coursera.org/specializations/data-science-python) on Coursera, they didn\'t go beyond Jupyter notebooks. But at some point, every data scientist is gonna ask herself how to deploy the model from the notebook and integrate it into other production systems.\n\nThe best strategy for deployment will of course depend on the individual case. 
For very low latency in predictions it might be a good idea to use streaming via a message queue or directly code the model into the app that will use it. For large batch predictions, it might be a better idea to store them in a shared data base which other applications can then query. In terms of flexibility, it is always a good compromise to serve a model via a web service in the form of a REST API (**re**presentative **s**tate **t**ransfer **a**pplication **p**rogramming **i**nterface). All predictions can be handled on-the-fly by sending HTTP requests. Pretty much every language has integrations for web technology and can communicate via the hyper text transfer protocol (HTTP), which makes it very easy to integrate the machine learning model into any other application. Let\'s elaborate on this in the next section. \n\n### Why using a REST API\n\nAs just mentioned above, it makes the integration with other applications very easy. Imagine the alternatives: You could just throw the code over to the engineer who develops an app that needs your model, which is probably gonna turn into a mess, trying to glue your code into the rest of the codebase. It may work a lot better if you wrap up your model in a self-contained package (like [`tox-block`](https://pypi.org/project/tox-block/)) that exposes the needed functionality to the outside. But what if that other app isn\'t written in Python? Maybe it\'s a Java Android app or a NodeJS web app. You\'d need some Frankenstein wrapper code to integrate your package. \n\nIt would be much easier if you\'d have some kind of contract between applications that specifies how an input should look like and what kind of output you\'ll get back - that\'s the API. Implementing it as a web service, accessible by HTTP requests, allows it to be accessed by almost any application from anywhere, as long as there is internet. The prefix REST, meaning representational state transfer, refers to the architecture style of the API. REST imposes 6 constraints on how the API has to look like to be truly RESTful, such as a uniform interface and a non-dependency and statelessness in the client-server-interaction. You can read a more detailed discussion [here](https://restfulapi.net/rest-architectural-constraints/).\n\n### Building the API with Flask and gunicorn\n\nSetting up the API is actually a fairly easy thing to do. We already have the predictive capabilities from the [`tox-block` package](https://github.com/Pascal-Bliem/tox-block) and can use the functions `tox_block.prediction.make_single_prediction` or `tox_block.prediction.make_predictions` to make predictions. Now we just have to find a way to make those available via HTTP requests. I decided to go with Flask, a micro web framework written in Python, because it is very easy and fast to set it up and it has all the functionality we need. For every Flask application, you\'d set up some config, write some routes, and create the app from it (which you can all find in Flask\'s [quick start guide](https://flask.palletsprojects.com/en/1.1.x/quickstart/)). The interesting part for us are the routes that will wrap the prediction functions. 
Let\'s take `make_predictions`:\n\n```python\nimport os\nfrom flask import Blueprint, request, jsonify, abort\nfrom tox_block.prediction import make_predictions\nfrom tox_block_api.validation import validate_multiple_inputs\nfrom tox_block import __version__ as _version\nfrom tox_block_api import __version__ as api_version\n\ntox_block_app = Blueprint("tox_block_app", __name__)\n\n@tox_block_app.route("/v1/make_predictions", methods=["POST"])\ndef api_make_predictions():\n if request.method == "POST":\n # Step 1: Extract POST data from request body as JSON\n input_json = request.get_json()\n \n # Step 2: Validate the input\n input_data, errors = validate_multiple_inputs(input=input_json)\n if not errors is None:\n abort(400, f"Errors occurred when validating the input data: {errors}")\n\n # Step 3: Model prediction\n prediction = make_predictions(input_texts=input_data)\n\n # Step 5: Return the response as JSON\n return jsonify({"predictions": prediction,\n "model_version": _version,\n "api_version": api_version,\n })\n```\nWe create a route to `/v1/make_predictions` that will accept HTTP POST requests with the Flask decorator `@tox_block_app.route("/v1/make_predictions", methods=["POST"])`. In the function under the decorator, we read the JSON-formatted input data from the request body, which looks something like this:\n```python\n{"input_data": ["Some texts", "to be classified"]}\n```\nThese inputs are passed to the prediction function `make_predictions`, and the predictions, together with model and API versions, are returned as JSON in the body of the HTTP response. \n\nBefore we call the prediction function, we should validate the input data which is done by `validate_multiple_inputs`:\n```python\ndef validate_multiple_inputs(input: List[str]) -> Tuple[List[str], str]:\n \n errors = None\n \n try:\n # check if JSON contains the key "input_texts"\n input = input.get("input_data", None)\n if input is None:\n raise KeyError("The key \'input_data\' was not found in the received JSON.") \n # check if input is list\n if isinstance(input,list):\n # check if list is empty\n if len(input) == 0:\n raise ValueError("Passed an empty list.")\n # check if all list items are non-empty strings\n for i, item in enumerate(input):\n if not isinstance(item,str):\n raise TypeError(f"The list item at position {i} is not a string.")\n if item == "":\n raise ValueError(f"The list item at position {i} is an empty string.")\n else:\n raise TypeError("The passed object is not a list of strings.") \n except (ValueError, TypeError, KeyError) as exc:\n errors = str(exc) \n \n return input, errors\n```\nThis function checks if the input is a list with only non-empty string elements. Any validation errors will be returned and caught by\n```python\nif not errors is None:\n abort(400, f"Errors occurred when validating the input data: {errors}")\n```\nwhich will cause a HTTP response of type `400 Bad request` with the error message in its body.\n\nThat\'s basically all there is to it. You can find all the source code and some usage examples in the `tox-block-api` [repo on Github](https://github.com/Pascal-Bliem/tox-block-api).\n\nNow, Flask itself does come with a very simple built-in web server, but it should only be used for development purpose. You can instead run the Flask app on [gunicorn](https://gunicorn.org/), which is a Python WSGI HTTP Server for UNIX. Gunicorn will allow to handle a high volume of traffic and can handle requests in parallel. 
It internally handles calling of the Flask code by potentially having parallel workers ready to handle requests, whereas the build-in Flask server only handles requests sequentially. Having gunicorn installed and the entry point to the Flask app in some kind of `run.py` file, you can run Flask on gunicorn in a production setting with just two command lines:\n```bash\n#!/usr/bin/env bash\nexport IS_DEBUG=${DEBUG:-false}\nexec gunicorn --bind 0.0.0.0:$PORT --access-logfile - --error-logfile - run:application\n```\n\n### Packing everything up in a Docker container\n\nWhat is a container and why bother using it? Probably everyone who has worked with Python knows how annoying dependencies can be, even if you use virtual environment management tools like [venv](https://docs.python.org/3/library/venv.html) or [conda](https://docs.conda.io/en/latest/). Sometimes not all dependencies are available on one package index, you\'ll start using both `conda` and `pip` install and they don\'t keep track of each other, some version requirements are conflicting an you\'re unsure if it\'s going to work anyway, yadda yadda yadda. Even if you get all dependencies installed with their right versions, sometimes unexpected things happen when you switch from one operating system to another. \n\nThis is where [Docker](https://www.docker.com/) containers come in really handy. They wrap up the entire application, with everything it needs to run, into one entity that can be executed on a runtime (the Docker runtime in this case). No matter where you want to run the application, as long as you have Docker, you can run the container because it already includes everything it needs to run. Containers are more light weight than virtual machines (VM). They don\'t each need a hypervisor-controlled guest operating system that requires a fixed amount of resources, but run on the Docker engine which itself runs on the host operating system (as sketched in the figure below). Hence, containers are more flexible in resource allocation, they share a single kernel and can share application libraries. They generally have a lower system overhead compared to VMs, resulting in better performance and faster launch times. Finally, containers can be orchestrated to perform optimally together by systems like [Kubernetes](https://kubernetes.io/) or [Docker Swarm](https://docs.docker.com/engine/swarm/).\n\n![A comparison of Docker containers and virtual machines.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/tox-block/docker-vm.png)\n\nOnce Docker is [installed](https://docs.docker.com/get-docker/), creating a container image is fairly easy. All commands need to be included in a [`Dockerfile`](https://docs.docker.com/engine/reference/builder/). 
We\'ll start building on a Python image that is available on [Docker hub](https://hub.docker.com/), a container image registry:\n```dockerfile\n# inside of Dockerfile\nFROM python:3.7.0\n```\nThen we\'ll create a user to run the app, set the working directory fo the app to run in, environment variables, and copy everything from the local directory into the containers working directory:\n```dockerfile\nRUN adduser --disabled-password --gecos \'\' tox-block-api-user\nWORKDIR /opt/tox_block_api\nENV FLASK_APP run.py\nADD ./ /opt/tox_block_api/\n```\nThen we install all the requirements we need:\n```dockerfile\nRUN pip install --upgrade pip\nRUN pip install -r /opt/tox_block_api/requirements.txt\n```\nMake the shell script that\'ll start the Flask app on gunicorn executable and set ownership for our user:\n```dockerfile\nRUN chmod +x /opt/tox_block_api/run.sh\nRUN chown -R tox-block-api-user:tox-block-api-user ./\nUSER tox-block-api-user\n```\nAnd finally, expose a port and run the application:\n```dockerfile\nEXPOSE 5000\nCMD ["bash", "./run.sh"]\n```\nThat\'s the whole `Dockerfile`. Now we can build the Docker image and, from it, launch a docker container:\n```bash\n$ docker build -t tox_block_api:latest\n$ docker run --named tox_block_api -d -p 5000:5000 -rm tox_block_api:latest\n```\n\nMost cloud providers have their own container image registries to which one can push Docker images. I deployed this project on [Render](https://www.render.com/). With Render, it\'s very easy to deploy Docker containers by pushing the image to their registry and then releasing it as a web app.\n\n### Trying out the ToxBlock API\n\n\nOkay, demo time. Feel free to follow along and test the ToxBlock API yourself, but please keep in mind: the app is running on one of Render\'s free instances, which is for development purpose only. It won\'t be able to handle a lot of traffic and if I notice that it is being used heavily, I\'ll be forced to take it down or introduce authentication. If you want to integrate it into one of your apps, you can easily set it up yourself. It is fully open-source and you can find the usage instructions in the `README.md` of its [repo](https://github.com/Pascal-Bliem/tox-block-api). Now, first of all we\'ll need something that can send HTTP requests. For API testing, I usually use [Postman](https://www.postman.com/), but if you don\'t want to install anything locally or sign up anywhere, you can use web-based API testing services, e.g. [reqbin.com](https://reqbin.com/). \n\nLet\'s check if the web service is up and running by sending a GET request to [`tox-block-api.onrender.com/health`](https://tox-block-api.onrender.com/health). You should get get back response saying 200 ok. Render\'s free instances go into a sleep mode if they\'re idle, so it may take a few seconds to wake it up after sending the request. You can also check the version of the deep learning model and the API with a GET request to [`tox-block-api.onrender.com/version`](https://tox-block-api.onrender.com/version). At the time of writing this post, this should return a JSON like this:\n```python\n{\n"api_version": "0.1.0",\n"model_version": "0.1.2"\n}\n```\nOkay, if that has worked, we can now try to classify some potentially toxic text. Predictions for single strings of text can me made via sending a POST request to the endpoint [`tox-block-api.onrender.com/v1/make_single_prediction`]. 
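\n\nIf you prefer scripting over a GUI, the same requests can of course also be sent from code, for example with Python\'s `requests` package (just a sketch of what a client might do, assuming the https URL of the deployed instance):\n\n```python\nimport requests\n\nBASE_URL = "https://tox-block-api.onrender.com"\n\n# wake the instance up and check that it is running\nprint(requests.get(f"{BASE_URL}/health").status_code)  # 200\nprint(requests.get(f"{BASE_URL}/version").json())      # model and API versions\n\n# classify a single string of text\nresponse = requests.post(\n    f"{BASE_URL}/v1/make_single_prediction",\n    json={"input_data": "Some text to be classified"},\n)\nprint(response.status_code, response.json())\n```\n\n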
The request\'s body should contain JSON data with the key `input_data` and string as value:\n```python\n{\n "input_data": "I will kill you, you f***king idiot!"\n}\n```\nYou should get back status 200 and a JSON looking like\n```python\n{\n "api_version": "0.1.0",\n "model_version": "0.1.2",\n "predictions": {\n "identity_hate": 0.1710592806339264,\n "insult": 0.9883397221565247,\n "obscene": 0.9885633587837219,\n "severe_toxic": 0.7870364189147949,\n "text": "I will kill you, you f***king idiot!",\n "threat": 0.8483908176422119,\n "toxic": 0.9998680353164673\n }\n}\n```\nSimilarly, predictions for multiple strings of text can me made via sending a POST request to the endpoint [`tox-block-api.onrender.com/v1/make_predictions`]. The request\'s body should contain JSON data with the key `input_data` and a list of strings as value:\n```python\n{\n "input_data": ["Good morning my friend, I hope you\'re having a fantastic day!",\n "I will kill you, you f***king idiot!"]\n}\n```\nThe response will contain a JSON with the machine learning model and API version, and for each input element, the original text, and the predicted probabilities for each category of toxicity:\n```python\n{\n "api_version": "0.1.0",\n "model_version": "0.1.2",\n "predictions": {\n "0": {\n "identity_hate": 0.0021067343186587095,\n "insult": 0.00757843442261219,\n "obscene": 0.004466842859983444,\n "severe_toxic": 0.0006274481420405209,\n "text": "Good morning my friend, I hope you\'re having a fantastic day!",\n "threat": 0.009578478522598743,\n "toxic": 0.05347811430692673\n },\n "1": {\n "identity_hate": 0.17105941474437714,\n "insult": 0.9883397221565247,\n "obscene": 0.9885633587837219,\n "severe_toxic": 0.7870364785194397,\n "text": "I will kill you, you f***king idiot!",\n "threat": 0.8483907580375671,\n "toxic": 0.9998679757118225\n }\n }\n}\n```\nIn case your input doesn\'t have the right format you should get a response saying 400 Bad request, along with an error message. For example, if your JSON doesn\'t contain the key `"input_data"`, you\'ll get \n```html\n\n400 Bad Request\n

Bad Request\nErrors occurred when validating the input data: "The key \'input_data\' was not found in the received JSON."
\n```\nor if you pass something that cannot be interpreted as a string, e.g. `{"input_data": 123456}`, you will get\n```html\n\n400 Bad Request\n

Bad Request\nErrors occurred when validating the input data: The passed object is not a string.
\n```\n\nAnd that\'s about it. I hope I was able to bring across why it\'s a good idea to serve machine learning models as a containerized application via a REST API and how to do so. Please tell me if you end up using **ToxBlock** or the **ToxBlock API** within one of your own projects. Thanks for reading!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("ToxBlock: using AI to keep discussions clean","A deep learning application that recognizes verbal toxicity in text",new Date("2020-06-15"),"https://wp.stanforddaily.com/wp-content/uploads/2018/03/AL.030818.Clickbait.crop_-1.jpg","Cut the bulls%@$t !",["Data Science & AI/ML"],"**TL;DR:** I built an open-source Python package called `tox-block` which can classify English text into six different categories of verbal toxicity. You can download it from the Python package index ([PyPI](https://pypi.org/project/tox-block/)) with `pip install tox-block` and find some usage examples in its [repo on Github](https://github.com/Pascal-Bliem/tox-block). If you're interested in the motivation and story behind it, just keep on reading.\n\n*Disclaimer: Since this is a post on classification of toxic language, it inevitably contains examples of toxic language that may be considered profane, vulgar, or offensive. If you do not wish to be exposed to toxic language, DO NOT proceed to read any further.*\n\n### Why care about toxic language?\n\nThere is no place like the internet when it comes to observing what the Germans call *verrohung der Gesellschaft*, which roughly translates to *brutalization of society*. Being caught in confrontation-lacking echo bubbles and hidden behind the anonymity of a screen and a fake user name, people can say very nasty things when they suddenly find themselves confronted with options or views that differ form their own. While some social networking companies thrive on the user engagement that comes with dirty verbal fights, most moderate users suffer. And not just on a level of personal sentiment. Extreme verbal toxicity scares away moderate users and effectively shuts down any proper discussion or exchange of opinion; a process that is absolutely crucial for the development of educated and fact-based opinions and viewpoints in an open and free society.\n\nSo what can be done to foster reasonable discussions on social media or in the comment sections of news sites, blogs, and video streaming platforms? Of course, toxic contents can usually be reported to the platform hosts (in many countries even to the police) by other users, but that usually requires a manual review by the hosts' staff; a task for which many small publishers simply don't have the necessary capacities. Larger companies can often outsource this tasks to low-wage countries, but that doesn't necessarily speed up the review process and, frankly, it's not a nice job to have. I have personally talked to so-called *content moderators* in the Philippines and Thailand, and they have confirmed that seeing the darkest parts of the web all day long is impacting their mental health. No perfect solution, by far.\n\nBut hey, we live in the age of deep learning. Shouldn't AI be able to take care of this by know? In fact, sentiment classification is a classic task in the field of natural language processing (NLP) and I suppose that classifying toxicity falls somewhere along those lines. 
About two years ago in spring 2018, Kaggle concluded a [competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview) on toxic comment classification hosted by Conversation AI / Jigsaw. They provided labeled data of about 250000 comments from Wikipedia articles of which about 10% contained some type of the toxicity categories toxic, severe toxic, obscene, insult, threat, and identity hate. Many of the competitors managed to build very well performing models, so I thought this may be a great chance for me to learn a thing or two about NLP and maybe end up building something interesting or even useful.\n\nThe next two section will go deeper into what can be done with the given data and how the neural network model behind ToxBlock works. If you're less interested into getting lost in the technical details and more into finding out how you can actually use ToxBlock, feel free to fast-forward to the [corresponding section](#usage).\n\n### Taking a look at the data\n\nThe [data set](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data) I used already came in a very usable form; columnar data with the comment text the the respective labels. Each comment could have multiple labels if belonging to multiple categories of toxicity. The classes are distributed quite unevenly, which can make it more challenging for a machine learning algorithm to recognize instances of the minority classes. Only about 10% of the comments in the data set belong to any of the toxic classes. Also among those classes the distribution is not balanced, as can be seen in the figure below.\n\n\"Distribution\n\nIf we look at the 10 most commonly appearing words in toxic comments (I've censored them here to not have my site flagged as NSFW by search engines), some are a bit of a surprise:\n\n- fu***k\n- like\n- sh***t\n- wikipedia\n- ni***er\n- fu***king\n- suck\n- ass\n- hate\n- u\n\nI really hope my model won't end up recognizing the word *Wikipedia* as an insult.\nLet's randomly pick a non-toxic and a toxic comment to get an idea for how they look like. \n\nA non-toxic comment: \n\" Main Page \n\nI think whoever is really writing this article should try to get it featured on the main page before the election, because after the election who cares? \" \n\nA toxic comment: \n\" Cu###ts\n\nWay to discuss, eh? Protect the article and then fu###k it up. Despicable. Well if you're the kind of people who think there is some special meaning to being blown up in a Montego, I haven't got the fu###king time to argue. 190.46.108.141\" \n\nWe can see in the toxic example that it contains the user's IP address. Besides of IPs some of the comments also contain user names and hyperlinks - all features that should be removed because they are not useful for general text classification and can even cause data leakage from the training into the test set, in case the same users show up in both sets. \n\nBesides dataset-specific cleaning, there are some common an general preprocessing strategies for text data in NLP. As in any machine learning problem, one wants to maximize the signal to noise ratio. For text that means that there is a lot that we can sort out before before we train a model or make inferences. In most cases, punctuation and numbers don't play a huge role in classifying the meaning in text, hence, they're often removed. But there are also regular words that used so frequently that they rarely contribute much information. 
Such words are called stop words and are often removed in NLP preprocessing. In English those could be words such as personal pronouns (\"I, you, he\"), \"otherwise\", \"regarding\", \"again\", \"into\", \"further\", you get the idea. You can find a collection of typical stop words for over 40 languages [here](https://www.ranks.nl/stopwords). It sometimes makes sense to map different words with a similar or the same word stem onto one word if they convey the same meaning. This is called *text normalization* and two popular methods are called *stemming* and *lemmatization*. Stemming and lemmatization reduce inflected words to their root forms, with the difference that lemmatization produces stems that actually exist in the language. All these NLP tasks are readily implemented in NLP Python packages like `spacy` or `nltk`, but I actually ended up using none of them in the final model. The performance gain resulting from these methods was so small that I decided to go for fewer computations and, hence, faster inference.\n\n### How the model works\n\nBefore deep learning became so popular, features were usually extracted from text by *tokenizing* the text of each document into words or [n-grams](https://en.wikipedia.org/wiki/N-gram) and creating some kind of token frequency matrix (e.g. by count vectorization or [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) vectorization). This would then be used as the input for some classifying algorithm. In the aforementioned Kaggle competition, a combination of naive Bayes and linear support vector machine classifiers, as described by [Wang and Manning](https://github.com/sidaw/nbsvm/blob/master/wang12simple.pdf), performed very well. The other go-to approach is to rely on deep learning, or recurrent neural networks to be more specific. These types of networks are well suited for processing sequence data, as their units possess an internal state that can be passed from one unit to the next. As text can be considered a sequence of words, and the meaning of one part of a sentence probably depends on another part of the sentence (or sequence), recurrent neural networks are great for NLP tasks. The two most popular types from this family are gated recurrent units (GRU) and long short-term memory (LSTM) networks. I went for the latter. If I'd try to describe the whole model in one gibberish sentence it would be something like: tokenize and sequentialize the input text, feed it into a pre-trained embedding layer, then into a bidirectional LSTM layer, use global 1D max-pooling, then into a fully-connected layer, add some dropout, into a final fully-connected layer with Sigmoid activations, and train the whole thing with its 1,045,756 parameters.\n\nBut what the heck does that mean? Let's look at it step by step. First we have to tokenize the sequences of text and turn them into a numerical form so that an algorithm can process them. Imagine we have a collection of documents, e.g. short sentences:\n```python\n[\"These are not the droids you are looking for.\",\n \"Actually, I think these are exactly the droids we are looking for.\"]\n```\nNow we take each document apart word by word, and turn each word into a number. 
Of course, the same word needs to be assigned the same number across all documents:\n```python\n[\n [2, 1, 7, 3, 4, 8, 1, 5, 6], \n [9, 10, 11, 2, 1, 12, 3, 4, 13, 1, 5, 6]\n]\n```\nFinally, we need to cut off or pad these numerical sequences with zeroes so that they all have the same length:\n```python\n[\n [ 0, 0, 0, 0, 0, 0, 2, 1, 7, 3, 4, 8, 1, 5, 6],\n [ 0, 0, 0, 9, 10, 11, 2, 1, 12, 3, 4, 13, 1, 5, 6]\n]\n```\nNow these sequences can be fed into an embedding layer. The idea behind word embeddings is to encode the meaning of words as vectors in a high-dimensional space. Word vectors of words with a similar meaning (e.g. bee, wasp, bumblebee, hornet) should be close together in this space, vectors of words that have nothing to do with each other should be far apart, and vectors of words with opposite meanings like \"front\" or \"back\" should ideally point in opposite directions. If you subtract the \"male\" vector from the word \"king\" and add the \"female\" vector, you should end up somewhere close to the word \"queen\" (as sketched in the figure below). To train such embeddings well, one usually needs **a lot** of text; hence, it often makes sense to initialize the parameters of the embedding layer with pre-trained embeddings and then fine-tune them on the given task. There are many pre-trained embeddings available such as [ELMo](https://allennlp.org/elmo), [BERT](https://pypi.org/project/bert-embedding/), [Word2vec](https://code.google.com/archive/p/word2vec/), or [fastText](https://fasttext.cc/); I decided to go with [GloVe](https://nlp.stanford.edu/projects/glove/) which has been pre-trained on the Wikipedia and Gigaword 5 corpora. \n\n![Word embeddings encode the meaning of words into n-dimensional vectors.](https://miro.medium.com/max/3010/1*sXNXYfAqfLUeiDXPCo130w.png)\n\nAfter having transformed the sequences of words into sequences of word vectors, we can pass them on to the core piece of the model: the recurrent layer. [Simple recurrent units](https://en.wikipedia.org/wiki/Recurrent_neural_network) often have a problem with [vanishing gradients](https://en.wikipedia.org/wiki/Vanishing_gradient_problem) during training. To avoid this problem, gated recurrent neural networks, such as the LSTM, have been introduced. Besides getting a sequence as input and also outputting a sequence, a layer of LSTM units (or cells) also propagates a cell state vector from one cell to the next (see figure below). Each cell can read from it, write to it, or reset itself, and it does so via so-called gates. An LSTM unit has three of these gates called *forget*, *input*, and *output* which decide how much information of the cell state is kept, what values will be updated, and what parts of the cell state will be the output. I don't want to discuss the ins and outs of LSTMs at length here, but let me point you to this great [article](https://colah.github.io/posts/2015-08-Understanding-LSTMs/). The LSTM layer in this model is bidirectional, which means it parses the input sequences both ways, front to back and back to front. This can be very useful when dealing with natural language, as the different parts of a sentence may depend on each other in both directions (You don't see what I mean? 
Try learning German...we chop up verbs into several pieces and scatter them all over the sentence).\n\n![Recurrent units in an LSTM avoid the vanishing gradient problem by using gates to learn what values of the cell state should be forgotten, updated, and outputted.](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2017/12/10131302/13.png)\n\nThe recurrent layer will output another sequence, but at this point we are going to down-sample it a bit. We apply global max-pooling over the sequence, which means that of all the steps (or n-dimensional word vectors) in the sequence, we only take the maximum value for each dimension. That sounds a bit confusing, so let's visualize it. Imagine we have a tensor of shape (3,3,1) corresponding to (batch size, sequence steps, vector dimensions):\n```python\n[[[1.], [2.], [3.]],\n [[4.], [5.], [6.]],\n [[7.], [8.], [9.]]]\n```\nOne-dimensional global max-pooling would reduce it to a tensor of shape (3,1) corresponding to (batch size, vector dimensions):\n```python\n[[3.],\n [6.],\n [9.]]\n```\nWhy would we do this in the first place? We'll feed it into a fully-connected layer next, for which the tensors need to be flattened in some way. By global max-pooling, we can greatly reduce the dimensionality and try to \"extract signal from noise\". The argument goes something like: the sequence part of interest is likely to produce the largest value, hence, it should be enough to only keep the maximum value.\n\nNext stop is a fully-connected (or dense) layer. All of the values we got from the previous layer will be multiplied with specific weights (which are learned during training) and serve as inputs for all units in the fully-connected layer (as sketched in the figure below). The outputs of this layer will be passed through a [ReLU](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation and some [dropout](https://en.wikipedia.org/wiki/Dilution_(neural_networks)). In the context of neural networks, dropout means that some of the connections between the layers are randomly set to zero, which forces the network to robustly spread the information flow over more units and, hence, serves as a regularization technique to reduce over-fitting. Finally, we enter a last fully-connected layer with only six units, corresponding to the six categories of verbal toxicity that we are trying to predict. Each of their outputs is passed to a [Sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function) activation function that forces the output to lie between zero and one - a value that corresponds to the predicted probability of the respective category.\n\n*In a fully-connected layer, every unit receives weighted inputs from all units of the previous layer.*\n\n### How you can use ToxBlock\n\nYou can find the source code and usage examples in the [ToxBlock repo](https://github.com/Pascal-Bliem/tox-block) on Github, from where you can also clone it. To make it easily accessible, I also put `tox-block` on the Python package index ([PyPI](https://pypi.org/project/tox-block/)) from where you can download and install it via\n```\npip install tox-block\n```\nThe methods for prediction are contained in the module `tox_block.prediction`. 
Predictions for single strings of text can me made via `tox_block.prediction.make_single_prediction`:\n\n```python\nfrom tox_block.prediction import make_single_prediction\n\nmake_single_prediction(\"I will beat you up, you f***king idiot!\")\n```\nIt will return a dictionary with the original text and the predicted probabilities for each category of toxicity:\n```python\n{'text': 'I will beat you up, you f***king idiot!',\n 'toxic': 0.9998680353164673,\n 'severe_toxic': 0.7870364189147949,\n 'obscene': 0.9885633587837219,\n 'threat': 0.8483908176422119,\n 'insult': 0.9883397221565247,\n 'identity_hate': 0.1710592657327652}\n```\nTo make bulk predictions for several texts, they can be passed as a list of strings `into tox_block.prediction.make_predictions`:\n```python\nfrom tox_block.prediction import make_predictions\n\nmake_predictions([\"Good morning my friend, I hope you're having a fantastic day!\",\n \"I will beat you up, you f***king idiot!\",\n \"I do strongly disagree with the fascist views of \\\n this joke that calls itself a political party.\"])\n```\nIt will return a dictionary of dictionaries of which each contains the original text and the predicted probabilities for each category of toxicity:\n```python\n{\n0: {'text': \"Good morning my friend, I hope you're having a fantastic day!\",\n 'toxic': 0.05347811430692673,\n 'severe_toxic': 0.0006274021579883993,\n 'obscene': 0.004466842859983444,\n 'threat': 0.009578478522598743,\n 'insult': 0.00757843442261219,\n 'identity_hate': 0.002106667961925268},\n 1: {'text': 'I will beat you up, you f***king idiot!',\n 'toxic': 0.9998679757118225,\n 'severe_toxic': 0.7870362997055054,\n 'obscene': 0.9885633587837219,\n 'threat': 0.8483908176422119,\n 'insult': 0.9883397221565247,\n 'identity_hate': 0.171059250831604},\n 2: {'text': 'I do strongly disagree with the fascist views of this joke that calls itself a political party.',\n 'toxic': 0.026190076023340225,\n 'severe_toxic': 7.185135473264381e-05,\n 'obscene': 0.0009493605466559529,\n 'threat': 0.00012321282702032477,\n 'insult': 0.0029190618079155684,\n 'identity_hate': 0.0022098885383456945}\n}\n```\nThat's basically the core functionality. For more details, please refer to the [repo](https://github.com/Pascal-Bliem/tox-block). How about you try to integrate this Python package into you own applications, or use it to keep your own website, forum, or blog clean from nasty language that ruins everyone elses experience? \n\nHaving a python package for toxic language recognition is cool, but maybe it's not the most elegant solution to integrate it into your code base, especially if your application is written in a language other than Python. We're on the web anyway, so we might as well handle all of this with a simple HTTP request. That's why I'm currently working on the **ToxBlock API**, a REST API that will make ToxBlock's predictive capabilities available for any other application that can wrap nasty texts in JSON and POST it to an API endpoint. 
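To give you an idea of what that could look like from the client's side, here is a minimal sketch using Python's `requests` library. The API is still work in progress, so the endpoint URL and the exact JSON format below are just assumptions for illustration, not the final interface:

```python
# Hypothetical client call to the planned ToxBlock API.
# Endpoint URL and payload/response layout are assumptions, not the final API.
import requests

response = requests.post(
    "https://toxblock.example.com/api/v1/predict",  # placeholder endpoint
    json={"texts": ["I will beat you up, you f***king idiot!"]},
)

# One would expect per-category probabilities back,
# similar to the output of make_predictions() shown above.
print(response.json())
```

This way, any application that can send an HTTP request could use the model without having to touch any Python code itself.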
Stay tuned for more and thanks for reading!\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("A data set of handwritten Chinese characters","Input data for training a deep-learning optical character recognition system",new Date("2020-05-22"),"https://asiasociety.org/sites/default/files/styles/1200w/public/C/calligraphy.jpg","Let's try to find some data that will help us to decipher Chinese handwriting with ML!",["Data Science & AI/ML"],'**TL;DR:** I uploaded data sets with [handwritten](https://www.kaggle.com/pascalbliem/handwritten-chinese-character-hanzi-datasets) and [handwriting-style font](https://www.kaggle.com/pascalbliem/chinese-characters-from-handwritingstyle-fonts) Chinese characters on Kaggle, which can be used to train machine learning models for optical character recognition (OCR) on Chinese handwriting. If you\'re interested in the story behind it, just keep on reading.\n\nI am currently studying Mandarin Chinese, and as you may have guessed, it really isn\'t the easiest natural language to learn. I remember that when I was learning Indonesian, it only took me around three months to be able to have simple conversations. Now, after about 5 months of studying Chinese for a few hours per week, I am still struggling with the basics. Part of the reason is the pronunciation, which is extremely different form any language I\'ve encountered before. And then there are, of course, the Chinese characters (Hanzi \u6c49\u5b57). There are about 50000 of them, but luckily only about 3000 are used in every day language. Furthermore, the characters used nowadays in mainland China, Singapore, and Malaysia are simplified ones (\u7b80\u4f53\u5b57), whereas Taiwan, Hong Kong, and Macau still use the traditional characters (\u7e41\u4f53\u5b57). That\'s definitely enough to utterly confuse me already, and since they\'re pictograms that don\'t convey any phonetic information, it is tricky to just find them on a keyboard to type them into a translation service to look up the meaning. Google [Translate](https://translate.google.com/) has a camera-based optical character recognition (OCR) function and Chinese dictionary apps like [Pleco](https://www.pleco.com/) offer OCR services as well, but they seem to be specialized on printed fonts and do not perform as well on handwritten characters. So what am I going to do when I want to decipher the sloppily written text on a sticky note that my friend left for on the fridge for me? I may try to build my own OCR system focused on handwriting, but first of all, I would need to find some data.\n\n### What is out there already \n\nConsidering that OCR has been a typical machine learning problem for decades already, the internet isn\'t exactly flooded with Chinese character data sets, especially when it comes to handwritten ones. And when it comes to sources in English language; due to China\'s great firewall, purely Chinese digital ecosystems have developed in which it\'s hard to find even a trace of English. I tried to searching through some [Baidu](http://www.baidu.com/) file sharing platform with my lousy Chinese skills and the help of Google Translate, but couldn\'t even register an account to download things because it seemed to require a Chinese mobile phone number. So, I thought, let\'s search the places a data scientist would usually go to: [Github](https://github.com/) and [Kaggle](https://www.kaggle.com/). 
There are some search results popping up when screening Github for "Chinese OCR" or "Chinese text recognition", but unsurprisingly, most of them (like [this](https://github.com/Wang-Shuo/Chinese-Text-Detection-and-Recognition) one, [this](https://github.com/wycm/xuexin-ocr) one, or [this](https://github.com/chineseocr/chineseocr) one) are written in Chinese (I should really put more effort into getting more fluent in it quickly). Most of them didn\'t seem to focus on handwritten characters anyway. One of the few things I found on Github in English was an amazing *[Chinese Text in the Wild](https://ctwdataset.github.io/)* data set with annotated images of occurrences of Chinese text from the perspective of a car-mounted camera (see image below); which unfortunately isn\'t about handwriting either.\n\n![An example of Chinese Text in the Wild from a car\'s perspective.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/hanzi-dataset/textinthewild.png)\n\nKaggle was going to be my next stop. The only existing [data set](https://www.kaggle.com/dylanli/chinesecharacter) I found there was a collection of Chinese font files, which could be used for generating images of characters from the fonts. It only contained 17 fonts though, meaning only 17 images per character (without image augmentation), plus maybe 9 more which I could find on [Google Fonts](https://fonts.google.com/?subset=chinese-simplified). That hardly seemed enough. But the idea wasn\'t a bad one: generating images from fonts. I came across a [blog post](https://blog.usejournal.com/making-of-a-chinese-characters-dataset-92d4065cc7cc) by Peter Burkimsher, who took a very ambitious take on this idea. He did not just care about maybe the 3000 most commonly used characters or even the 7330 characters in the GB2312 encoding, but he created 15 million image files of 52,835 characters. I downloaded the compressed archive which he kindly provided, but I had to give up before I could even finish unpacking it because it blew up my hard drive. Again, most of the fonts didn\'t resemble handwriting anyway, but his post pointed me towards another great resource: a website called [chinesefontdesign.com](https://chinesefontdesign.com/).\n\n### Creating Chinese character images from font files\n\nI browsed said website for handwriting-style fonts. Some manual selection was necessary here because some of the fonts were only for traditional characters (I was looking for the simplified ones only) and others labeled as handwriting-style actually looked nothing like human handwriting. I managed to find around 120 font files which I could use for generating images. Python\'s `PIL` package has some useful functionality for doing so. 
Here\'s a little code snipped that I used:\n```python\n# libraries for image processing\nfrom PIL import Image, ImageDraw, ImageFont, ImageChops\nimport cv2\n# libraries for I/O\nimport os\nimport glob\n\n# some common simplified Chinese characters and alphanumerics\ncharset = \'0123456789QWERTYUIOPASDFGHJKLZXCVBNMqwertzuiopasdfghjklyxcvbnm.:;-+!$#%&@\u201c\u201d\u300a\u300b\u7684\u4e00\u662f\u4e0d\u4e86\u5728\u4eba\u6709\u6211\u4ed6\u8fd9\u4e2a\u4eec\u4e2d\u6765\u4e0a\u5927\u4e3a\u548c\u56fd\u5730\u5230\u4ee5\u8bf4\u65f6\u8981\u5c31\u51fa\u4f1a\u53ef\u4e5f\u4f60\u5bf9\u751f\u80fd\u800c\u5b50\u90a3\u5f97\u4e8e\u7740\u4e0b\u81ea\u4e4b\u5e74\u8fc7\u53d1\u540e\u4f5c\u91cc\u7528\u9053\u884c\u6240\u7136\u5bb6\u79cd\u4e8b\u6210\u65b9\u591a\u7ecf\u4e48\u53bb\u6cd5\u5b66\u5982\u90fd\u540c\u73b0\u5f53\u6ca1\u52a8\u9762\u8d77\u770b\u5b9a\u5929\u5206\u8fd8\u8fdb\u597d\u5c0f\u90e8\u5176\u4e9b\u4e3b\u6837\u7406\u5fc3\u5979\u672c\u524d\u5f00\u4f46\u56e0\u53ea\u4ece\u60f3\u5b9e\u65e5\u519b\u8005\u610f\u65e0\u529b\u5b83\u4e0e\u957f\u628a\u673a\u5341\u6c11\u7b2c\u516c\u6b64\u5df2\u5de5\u4f7f\u60c5\u660e\u6027\u77e5\u5168\u4e09\u53c8\u5173\u70b9\u6b63\u4e1a\u5916\u5c06\u4e24\u9ad8\u95f4\u7531\u95ee\u5f88\u6700\u91cd\u5e76\u7269\u624b\u5e94\u6218\u5411\u5934\u6587\u4f53\u653f\u7f8e\u76f8\u89c1\u88ab\u5229\u4ec0\u4e8c\u7b49\u4ea7\u6216\u65b0\u5df1\u5236\u8eab\u679c\u52a0\u897f\u65af\u6708\u8bdd\u5408\u56de\u7279\u4ee3\u5185\u4fe1\u8868\u5316\u8001\u7ed9\u4e16\u4f4d\u6b21\u5ea6\u95e8\u4efb\u5e38\u5148\u6d77\u901a\u6559\u513f\u539f\u4e1c\u58f0\u63d0\u7acb\u53ca\u6bd4\u5458\u89e3\u6c34\u540d\u771f\u8bba\u5904\u8d70\u4e49\u5404\u5165\u51e0\u53e3\u8ba4\u6761\u5e73\u7cfb\u6c14\u9898\u6d3b\u5c14\u66f4\u522b\u6253\u5973\u53d8\u56db\u795e\u603b\u4f55\u7535\u6570\u5b89\u5c11\u62a5\u624d\u7ed3\u53cd\u53d7\u76ee\u592a\u91cf\u518d\u611f\u5efa\u52a1\u505a\u63a5\u5fc5\u573a\u4ef6\u8ba1\u7ba1\u671f\u5e02\u76f4\u5fb7\u8d44\u547d\u5c71\u91d1\u6307\u514b\u8bb8\u7edf\u533a\u4fdd\u81f3\u961f\u5f62\u793e\u4fbf\u7a7a\u51b3\u6cbb\u5c55\u9a6c\u79d1\u53f8\u4e94\u57fa\u773c\u4e66\u975e\u5219\u542c\u767d\u5374\u754c\u8fbe\u5149\u653e\u5f3a\u5373\u50cf\u96be\u4e14\u6743\u601d\u738b\u8c61\u5b8c\u8bbe\u5f0f\u8272\u8def\u8bb0\u5357\u54c1\u4f4f\u544a\u7c7b\u6c42\u636e\u7a0b\u5317\u8fb9\u6b7b\u5f20\u8be5\u4ea4\u89c4\u4e07\u53d6\u62c9\u683c\u671b\u89c9\u672f\u9886\u5171\u786e\u4f20\u5e08\u89c2\u6e05\u4eca\u5207\u9662\u8ba9\u8bc6\u5019\u5e26\u5bfc\u4e89\u8fd0\u7b11\u98de\u98ce\u6b65\u6539\u6536\u6839\u5e72\u9020\u8a00\u8054\u6301\u7ec4\u6bcf\u6d4e\u8f66\u4eb2\u6781\u6797\u670d\u5feb\u529e\u8bae\u5f80\u5143\u82f1\u58eb\u8bc1\u8fd1\u5931\u8f6c\u592b\u4ee4\u51c6\u5e03\u59cb\u600e\u5462\u5b58\u672a\u8fdc\u53eb\u53f0\u5355\u5f71\u5177\u7f57\u5b57\u7231\u51fb\u6d41\u5907\u5175\u8fde\u8c03\u6df1\u5546\u7b97\u8d28\u56e2\u96c6\u767e\u9700\u4ef7\u82b1\u515a\u534e\u57ce\u77f3\u7ea7\u6574\u5e9c\u79bb\u51b5\u4e9a\u8bf7\u6280\u9645\u7ea6\u793a\u590d\u75c5\u606f\u7a76\u7ebf\u4f3c\u5b98\u706b\u65ad\u7cbe\u6ee1\u652f\u89c6\u6d88\u8d8a\u5668\u5bb9\u7167\u987b\u4e5d\u589e\u7814\u5199\u79f0\u4f01\u516b\u529f\u5417\u5305\u7247\u53f2\u59d4\u4e4e\u67e5\u8f7b\u6613\u65e9\u66fe\u9664\u519c\u627e\u88c5\u5e7f\u663e\u5427\u963f\u674e\u6807\u8c08\u5403\u56fe\u5ff5\u516d\u5f15\u5386\u9996\u533b\u5c40\u7a81\u4e13\u8d39\u53f7\u5c3d\u53e6\u5468\u8f83\u6ce8\u8bed\u4ec5\u8003\u843d\u9752\u968f\u9009\u5217\u6b66\u7ea2\u54cd\u867d\u63a8\u52bf\u53c2\u5e0c\u53e4\u4f17\u6784\u623f\u534a\u8282\u571f\u6295\u67d0\u6848\u9ed1\u7ef4\u9769\u5212\u654c\u81f4\u9648\u5f8b\u8db3\u6001\u62a4\u4e
03\u5174\u6d3e\u5b69\u9a8c\u8d23\u8425\u661f\u591f\u7ae0\u97f3\u8ddf\u5fd7\u5e95\u7ad9\u4e25\u5df4\u4f8b\u9632\u65cf\u4f9b\u6548\u7eed\u65bd\u7559\u8bb2\u578b\u6599\u7ec8\u7b54\u7d27\u9ec4\u7edd\u5947\u5bdf\u6bcd\u4eac\u6bb5\u4f9d\u6279\u7fa4\u9879\u6545\u6309\u6cb3\u7c73\u56f4\u6c5f\u7ec7\u5bb3\u6597\u53cc\u5883\u5ba2\u7eaa\u91c7\u4e3e\u6740\u653b\u7236\u82cf\u5bc6\u4f4e\u671d\u53cb\u8bc9\u6b62\u7ec6\u613f\u5343\u503c\u4ecd\u7537\u94b1\u7834\u7f51\u70ed\u52a9\u5012\u80b2\u5c5e\u5750\u5e1d\u9650\u8239\u8138\u804c\u901f\u523b\u4e50\u5426\u521a\u5a01\u6bdb\u72b6\u7387\u751a\u72ec\u7403\u822c\u666e\u6015\u5f39\u6821\u82e6\u521b\u5047\u4e45\u9519\u627f\u5370\u665a\u5170\u8bd5\u80a1\u62ff\u8111\u9884\u8c01\u76ca\u9633\u82e5\u54ea\u5fae\u5c3c\u7ee7\u9001\u6025\u8840\u60ca\u4f24\u7d20\u836f\u9002\u6ce2\u591c\u7701\u521d\u559c\u536b\u6e90\u98df\u9669\u5f85\u8ff0\u9646\u4e60\u7f6e\u5c45\u52b3\u8d22\u73af\u6392\u798f\u7eb3\u6b22\u96f7\u8b66\u83b7\u6a21\u5145\u8d1f\u4e91\u505c\u6728\u6e38\u9f99\u6811\u7591\u5c42\u51b7\u6d32\u51b2\u5c04\u7565\u8303\u7adf\u53e5\u5ba4\u5f02\u6fc0\u6c49\u6751\u54c8\u7b56\u6f14\u7b80\u5361\u7f6a\u5224\u62c5\u5dde\u9759\u9000\u65e2\u8863\u60a8\u5b97\u79ef\u4f59\u75db\u68c0\u5dee\u5bcc\u7075\u534f\u89d2\u5360\u914d\u5f81\u4fee\u76ae\u6325\u80dc\u964d\u9636\u5ba1\u6c89\u575a\u5584\u5988\u5218\u8bfb\u554a\u8d85\u514d\u538b\u94f6\u4e70\u7687\u517b\u4f0a\u6000\u6267\u526f\u4e71\u6297\u72af\u8ffd\u5e2e\u5ba3\u4f5b\u5c81\u822a\u4f18\u602a\u9999\u8457\u7530\u94c1\u63a7\u7a0e\u5de6\u53f3\u4efd\u7a7f\u827a\u80cc\u9635\u8349\u811a\u6982\u6076\u5757\u987f\u6562\u5b88\u9152\u5c9b\u6258\u592e\u6237\u70c8\u6d0b\u54e5\u7d22\u80e1\u6b3e\u9760\u8bc4\u7248\u5b9d\u5ea7\u91ca\u666f\u987e\u5f1f\u767b\u8d27\u4e92\u4ed8\u4f2f\u6162\u6b27\u6362\u95fb\u5371\u5fd9\u6838\u6697\u59d0\u4ecb\u574f\u8ba8\u4e3d\u826f\u5e8f\u5347\u76d1\u4e34\u4eae\u9732\u6c38\u547c\u5473\u91ce\u67b6\u57df\u6c99\u6389\u62ec\u8230\u9c7c\u6742\u8bef\u6e7e\u5409\u51cf\u7f16\u695a\u80af\u6d4b\u8d25\u5c4b\u8dd1\u68a6\u6563\u6e29\u56f0\u5251\u6e10\u5c01\u6551\u8d35\u67aa\u7f3a\u697c\u53bf\u5c1a\u6beb\u79fb\u5a18\u670b\u753b\u73ed\u667a\u4ea6\u8033\u6069\u77ed\u638c\u6050\u9057\u56fa\u5e2d\u677e\u79d8\u8c22\u9c81\u9047\u5eb7\u8651\u5e78\u5747\u9500\u949f\u8bd7\u85cf\u8d76\u5267\u7968\u635f\u5ffd\u5de8\u70ae\u65e7\u7aef\u63a2\u6e56\u5f55\u53f6\u6625\u4e61\u9644\u5438\u4e88\u793c\u6e2f\u96e8\u5440\u677f\u5ead\u5987\u5f52\u775b\u996d\u989d\u542b\u987a\u8f93\u6447\u62db\u5a5a\u8131\u8865\u8c13\u7763\u6bd2\u6cb9\u7597\u65c5\u6cfd\u6750\u706d\u9010\u83ab\u7b14\u4ea1\u9c9c\u8bcd\u5723\u62e9\u5bfb\u5382\u7761\u535a\u52d2\u70df\u6388\u8bfa\u4f26\u5cb8\u5965\u5510\u5356\u4fc4\u70b8\u8f7d\u6d1b\u5065\u5802\u65c1\u5bab\u559d\u501f\u541b\u7981\u9634\u56ed\u8c0b\u5b8b\u907f\u6293\u8363\u59d1\u5b59\u9003\u7259\u675f\u8df3\u9876\u7389\u9547\u96ea\u5348\u7ec3\u8feb\u7237\u7bc7\u8089\u5634\u9986\u904d\u51e1\u7840\u6d1e\u5377\u5766\u725b\u5b81\u7eb8\u8bf8\u8bad\u79c1\u5e84\u7956\u4e1d\u7ffb\u66b4\u68ee\u5854\u9ed8\u63e1\u620f\u9690\u719f\u9aa8\u8bbf\u5f31\u8499\u6b4c\u5e97\u9b3c\u8f6f\u5178\u6b32\u8428\u4f19\u906d\u76d8\u7238\u6269\u76d6\u5f04\u96c4\u7a33\u5fd8\u4ebf\u523a\u62e5\u5f92\u59c6\u6768\u9f50\u8d5b\u8da3\u66f2\u5200\u5e8a\u8fce\u51b0\u865a\u73a9\u6790\u7a97\u9192\u59bb\u900f\u8d2d\u66ff\u585e\u52aa\u4f11\u864e\u626c\u9014\u4fb5\u5211\u7eff\u5144\u8fc5\u5957\u8d38\u6bd5\u552f\u8c37\u8f6e\u5e93\u8ff9\u5c24\u7ade\u8857\u4fc3\u5ef6\u9707\u5f03\u7532\u4f1f\u9ebb\u5ddd\u7533\u7f13\u6f5c\u95ea\u552e\u706f\u9488\u54f2\u7edc\u62b5\u6731\u57c3\u62b1\
u9f13\u690d\u7eaf\u590f\u5fcd\u9875\u6770\u7b51\u6298\u90d1\u8d1d\u5c0a\u5434\u79c0\u6df7\u81e3\u96c5\u632f\u67d3\u76db\u6012\u821e\u5706\u641e\u72c2\u63aa\u59d3\u6b8b\u79cb\u57f9\u8ff7\u8bda\u5bbd\u5b87\u731b\u6446\u6885\u6bc1\u4f38\u6469\u76df\u672b\u4e43\u60b2\u62cd\u4e01\u8d75\u786c\u9ea6\u848b\u64cd\u8036\u963b\u8ba2\u5f69\u62bd\u8d5e\u9b54\u7eb7\u6cbf\u558a\u8fdd\u59b9\u6d6a\u6c47\u5e01\u4e30\u84dd\u6b8a\u732e\u684c\u5566\u74e6\u83b1\u63f4\u8bd1\u593a\u6c7d\u70e7\u8ddd\u88c1\u504f\u7b26\u52c7\u89e6\u8bfe\u656c\u54ed\u61c2\u5899\u88ad\u53ec\u7f5a\u4fa0\u5385\u62dc\u5de7\u4fa7\u97e9\u5192\u503a\u66fc\u878d\u60ef\u4eab\u6234\u7ae5\u72b9\u4e58\u6302\u5956\u7ecd\u539a\u7eb5\u969c\u8baf\u6d89\u5f7b\u520a\u4e08\u7206\u4e4c\u5f79\u63cf\u6d17\u739b\u60a3\u5999\u955c\u5531\u70e6\u7b7e\u4ed9\u5f7c\u5f17\u75c7\u4eff\u503e\u724c\u9677\u9e1f\u8f70\u54b1\u83dc\u95ed\u594b\u5e86\u64a4\u6cea\u8336\u75be\u7f18\u64ad\u6717\u675c\u5976\u5b63\u4e39\u72d7\u5c3e\u4eea\u5077\u5954\u73e0\u866b\u9a7b\u5b54\u5b9c\u827e\u6865\u6de1\u7ffc\u6068\u7e41\u5bd2\u4f34\u53f9\u65e6\u6108\u6f6e\u7cae\u7f29\u7f62\u805a\u5f84\u6070\u6311\u888b\u7070\u6355\u5f90\u73cd\u5e55\u6620\u88c2\u6cf0\u9694\u542f\u5c16\u5fe0\u7d2f\u708e\u6682\u4f30\u6cdb\u8352\u507f\u6a2a\u62d2\u745e\u5fc6\u5b64\u9f3b\u95f9\u7f8a\u5446\u5389\u8861\u80de\u96f6\u7a77\u820d\u7801\u8d6b\u5a46\u9b42\u707e\u6d2a\u817f\u80c6\u6d25\u4fd7\u8fa9\u80f8\u6653\u52b2\u8d2b\u4ec1\u5076\u8f91\u90a6\u6062\u8d56\u5708\u6478\u4ef0\u6da6\u5806\u78b0\u8247\u7a0d\u8fdf\u8f86\u5e9f\u51c0\u51f6\u7f72\u58c1\u5fa1\u5949\u65cb\u51ac\u77ff\u62ac\u86cb\u6668\u4f0f\u5439\u9e21\u500d\u7cca\u79e6\u76fe\u676f\u79df\u9a91\u4e4f\u9686\u8bca\u5974\u6444\u4e27\u6c61\u6e21\u65d7\u7518\u8010\u51ed\u624e\u62a2\u7eea\u7c97\u80a9\u6881\u5e7b\u83f2\u7686\u788e\u5b99\u53d4\u5ca9\u8361\u7efc\u722c\u8377\u6089\u8482\u8fd4\u4e95\u58ee\u8584\u6084\u626b\u654f\u788d\u6b96\u8be6\u8fea\u77db\u970d\u5141\u5e45\u6492\u5269\u51ef\u9897\u9a82\u8d4f\u6db2\u756a\u7bb1\u8d34\u6f2b\u9178\u90ce\u8170\u8212\u7709\u5fe7\u6d6e\u8f9b\u604b\u9910\u5413\u633a\u52b1\u8f9e\u8258\u952e\u4f0d\u5cf0\u5c3a\u6628\u9ece\u8f88\u8d2f\u4fa6\u6ed1\u5238\u5d07\u6270\u5baa\u7ed5\u8d8b\u6148\u4e54\u9605\u6c57\u679d\u62d6\u58a8\u80c1\u63d2\u7bad\u814a\u7c89\u6ce5\u6c0f\u5f6d\u62d4\u9a97\u51e4\u6167\u5a92\u4f69\u6124\u6251\u9f84\u9a71\u60dc\u8c6a\u63a9\u517c\u8dc3\u5c38\u8083\u5e15\u9a76\u5821\u5c4a\u6b23\u60e0\u518c\u50a8\u98d8\u6851\u95f2\u60e8\u6d01\u8e2a\u52c3\u5bbe\u9891\u4ec7\u78e8\u9012\u90aa\u649e\u62df\u6eda\u594f\u5de1\u989c\u5242\u7ee9\u8d21\u75af\u5761\u77a7\u622a\u71c3\u7126\u6bbf\u4f2a\u67f3\u9501\u903c\u9887\u660f\u529d\u5448\u641c\u52e4\u6212\u9a7e\u6f02\u996e\u66f9\u6735\u4ed4\u67d4\u4fe9\u5b5f\u8150\u5e7c\u8df5\u7c4d\u7267\u51c9\u7272\u4f73\u5a1c\u6d53\u82b3\u7a3f\u7af9\u8179\u8dcc\u903b\u5782\u9075\u8109\u8c8c\u67cf\u72f1\u731c\u601c\u60d1\u9676\u517d\u5e10\u9970\u8d37\u660c\u53d9\u8eba\u94a2\u6c9f\u5bc4\u6276\u94fa\u9093\u5bff\u60e7\u8be2\u6c64\u76d7\u80a5\u5c1d\u5306\u8f89\u5948\u6263\u5ef7\u6fb3\u561b\u8463\u8fc1\u51dd\u6170\u538c\u810f\u817e\u5e7d\u6028\u978b\u4e22\u57cb\u6cc9\u6d8c\u8f96\u8eb2\u664b\u7d2b\u8270\u9b4f\u543e\u614c\u795d\u90ae\u5410\u72e0\u9274\u66f0\u68b0\u54ac\u90bb\u8d64\u6324\u5f2f\u6905\u966a\u5272\u63ed\u97e6\u609f\u806a\u96fe\u950b\u68af\u732b\u7965\u9614\u8a89\u7b79\u4e1b\u7275\u9e23\u6c88\u9601\u7a46\u5c48\u65e8\u8896\u730e\u81c2\u86c7\u8d3a\u67f1\u629b\u9f20\u745f\u6208\u7262\u900a\u8fc8\u6b3a\u5428\u7434\u8870\u74f6\u607c\u71d5\u4ef2\u8bf1\u72fc\u6c60\u75bc\u5362\u4ed7\u51a0\u7c92\u90
65\u5415\u7384\u5c18\u51af\u629a\u6d45\u6566\u7ea0\u94bb\u6676\u5c82\u5ce1\u82cd\u55b7\u8017\u51cc\u6572\u83cc\u8d54\u6d82\u7cb9\u6241\u4e8f\u5bc2\u7164\u718a\u606d\u6e7f\u5faa\u6696\u7cd6\u8d4b\u6291\u79e9\u5e3d\u54c0\u5bbf\u8e0f\u70c2\u8881\u4faf\u6296\u5939\u6606\u809d\u64e6\u732a\u70bc\u6052\u614e\u642c\u7ebd\u7eb9\u73bb\u6e14\u78c1\u94dc\u9f7f\u8de8\u62bc\u6016\u6f20\u75b2\u53db\u9063\u5179\u796d\u9189\u62f3\u5f25\u659c\u6863\u7a00\u6377\u80a4\u75ab\u80bf\u8c46\u524a\u5c97\u6643\u541e\u5b8f\u764c\u809a\u96b6\u5c65\u6da8\u8000\u626d\u575b\u62e8\u6c83\u7ed8\u4f10\u582a\u4ec6\u90ed\u727a\u6b7c\u5893\u96c7\u5ec9\u5951\u62fc\u60e9\u6349\u8986\u5237\u52ab\u5acc\u74dc\u6b47\u96d5\u95f7\u4e73\u4e32\u5a03\u7f34\u5524\u8d62\u83b2\u9738\u6843\u59a5\u7626\u642d\u8d74\u5cb3\u5609\u8231\u4fca\u5740\u5e9e\u8015\u9510\u7f1d\u6094\u9080\u73b2\u60df\u65a5\u5b85\u6dfb\u6316\u5475\u8bbc\u6c27\u6d69\u7fbd\u65a4\u9177\u63a0\u5996\u7978\u4f8d\u4e59\u59a8\u8d2a\u6323\u6c6a\u5c3f\u8389\u60ac\u5507\u7ff0\u4ed3\u8f68\u679a\u76d0\u89c8\u5085\u5e05\u5e99\u82ac\u5c4f\u5bfa\u80d6\u7483\u611a\u6ef4\u758f\u8427\u59ff\u98a4\u4e11\u52a3\u67ef\u5bf8\u6254\u76ef\u8fb1\u5339\u4ff1\u8fa8\u997f\u8702\u54e6\u8154\u90c1\u6e83\u8c28\u7cdf\u845b\u82d7\u80a0\u5fcc\u6e9c\u9e3f\u7235\u9e4f\u9e70\u7b3c\u4e18\u6842\u6ecb\u804a\u6321\u7eb2\u808c\u8328\u58f3\u75d5\u7897\u7a74\u8180\u5353\u8d24\u5367\u819c\u6bc5\u9526\u6b20\u54e9\u51fd\u832b\u6602\u859b\u76b1\u5938\u8c6b\u80c3\u820c\u5265\u50b2\u62fe\u7a9d\u7741\u643a\u9675\u54fc\u68c9\u6674\u94c3\u586b\u9972\u6e34\u543b\u626e\u9006\u8106\u5598\u7f69\u535c\u7089\u67f4\u6109\u7ef3\u80ce\u84c4\u7720\u7aed\u5582\u50bb\u6155\u6d51\u5978\u6247\u67dc\u60a6\u62e6\u8bde\u9971\u4e7e\u6ce1\u8d3c\u4ead\u5915\u7239\u916c\u5112\u59fb\u5375\u6c1b\u6cc4\u6746\u6328\u50e7\u871c\u541f\u7329\u9042\u72ed\u8096\u751c\u971e\u9a73\u88d5\u987d\u65bc\u6458\u77ee\u79d2\u537f\u755c\u54bd\u62ab\u8f85\u52fe\u76c6\u7586\u8d4c\u5851\u754f\u5435\u56ca\u55ef\u6cca\u80ba\u9aa4\u7f20\u5188\u7f9e\u77aa\u540a\u8d3e\u6f0f\u6591\u6d9b\u60a0\u9e7f\u4fd8\u9521\u5351\u846c\u94ed\u6ee9\u5ac1\u50ac\u7487\u7fc5\u76d2\u86ee\u77e3\u6f58\u6b67\u8d50\u9c8d\u9505\u5eca\u62c6\u704c\u52c9\u76f2\u5bb0\u4f50\u5565\u80c0\u626f\u79a7\u8fbd\u62b9\u7b52\u68cb\u88e4\u5509\u6734\u5490\u5b55\u8a93\u5589\u5984\u62d8\u94fe\u9a70\u680f\u901d\u7a83\u8273\u81ed\u7ea4\u7391\u68f5\u8d81\u5320\u76c8\u7fc1\u6101\u77ac\u5a74\u5b5d\u9888\u5018\u6d59\u8c05\u853d\u7545\u8d60\u59ae\u838e\u5c09\u51bb\u8dea\u95ef\u8461\u5f8c\u53a8\u9e2d\u98a0\u906e\u8c0a\u5733\u5401\u4ed1\u8f9f\u7624\u5ac2\u9640\u6846\u8c2d\u4ea8\u94a6\u5eb8\u6b49\u829d\u543c\u752b\u886b\u644a\u5bb4\u5631\u8877\u5a07\u9655\u77e9\u6d66\u8bb6\u8038\u88f8\u78a7\u6467\u85aa\u6dcb\u803b\u80f6\u5c60\u9e45\u9965\u76fc\u8116\u8679\u7fe0\u5d29\u8d26\u840d\u9022\u8d5a\u6491\u7fd4\u5021\u7ef5\u7334\u67af\u5deb\u662d\u6014\u6e0a\u51d1\u6eaa\u8822\u7985\u9610\u65fa\u5bd3\u85e4\u532a\u4f1e\u7891\u632a\u743c\u8102\u8c0e\u6168\u83e9\u8404\u72ee\u6398\u6284\u5cad\u6655\u902e\u780d\u638f\u72c4\u6670\u7f55\u633d\u813e\u821f\u75f4\u8521\u526a\u810a\u5f13\u61d2\u53c9\u62d0\u5583\u50da\u6350\u59ca\u9a9a\u62d3\u6b6a\u7c98\u67c4\u5751\u964c\u7a84\u6e58\u5146\u5d16\u9a84\u5239\u97ad\u8292\u7b4b\u8058\u94a9\u68cd\u56b7\u817a\u5f26\u7130\u800d\u4fef\u5398\u6123\u53a6\u6073\u9976\u9489\u5be1\u61be\u6454\u53e0\u60f9\u55bb\u8c31\u6127\u714c\u5fbd\u6eb6\u5760\u715e\u5dfe\u6ee5\u6d12\u5835\u74f7\u5492\u59e8\u68d2\u90e1\u6d74\u5a9a\u7a23\u6dee\u54ce\u5c41\u6f06\u6deb\u5de2\u5429\u64b0\u5578\u6ede\u73ab\u7855\u9493\u8776\u819d\
u59da\u8302\u8eaf\u540f\u733f\u5be8\u6055\u6e20\u621a\u8fb0\u8236\u9881\u60f6\u72d0\u8bbd\u7b28\u888d\u5632\u5561\u6cfc\u8854\u5026\u6db5\u96c0\u65ec\u50f5\u6495\u80a2\u5784\u5937\u9038\u8305\u4fa8\u8206\u7a91\u6d85\u84b2\u8c26\u676d\u5662\u5f0a\u52cb\u522e\u90ca\u51c4\u6367\u6d78\u7816\u9f0e\u7bee\u84b8\u997c\u4ea9\u80be\u9661\u722a\u5154\u6bb7\u8d1e\u8350\u54d1\u70ad\u575f\u7728\u640f\u54b3\u62e2\u8205\u6627\u64c5\u723d\u5496\u6401\u7984\u96cc\u54e8\u5de9\u7ee2\u87ba\u88f9\u6614\u8f69\u8c2c\u8c0d\u9f9f\u5ab3\u59dc\u778e\u51a4\u9e26\u84ec\u5df7\u7433\u683d\u6cbe\u8bc8\u658b\u7792\u5f6a\u5384\u54a8\u7eba\u7f50\u6876\u58e4\u7cd5\u9882\u81a8\u8c10\u5792\u5495\u9699\u8fa3\u7ed1\u5ba0\u563f\u5151\u9709\u632b\u7a3d\u8f90\u4e5e\u7eb1\u88d9\u563b\u54c7\u7ee3\u6756\u5858\u884d\u8f74\u6500\u818a\u8b6c\u658c\u7948\u8e22\u8086\u574e\u8f7f\u68da\u6ce3\u5c61\u8e81\u90b1\u51f0\u6ea2\u690e\u7838\u8d9f\u5e18\u5e06\u6816\u7a9c\u4e38\u65a9\u5824\u584c\u8d29\u53a2\u6380\u5580\u4e56\u8c1c\u634f\u960e\u6ee8\u864f\u5319\u82a6\u82f9\u5378\u6cbc\u94a5\u682a\u7977\u5256\u7199\u54d7\u5288\u602f\u68e0\u80f3\u6869\u7470\u5a31\u5a36\u6cab\u55d3\u8e72\u711a\u6dd8\u5ae9\u97f5\u886c\u5308\u94a7\u7ad6\u5cfb\u8c79\u635e\u83ca\u9119\u9b44\u515c\u54c4\u9896\u9551\u5c51\u8681\u58f6\u6021\u6e17\u79c3\u8fe6\u65f1\u54df\u54b8\u7109\u8c34\u5b9b\u7a3b\u94f8\u953b\u4f3d\u8a79\u6bd9\u604d\u8d2c\u70db\u9a87\u82af\u6c41\u6853\u574a\u9a74\u673d\u9756\u4f63\u6c5d\u788c\u8fc4\u5180\u8346\u5d14\u96c1\u7ec5\u73ca\u699c\u8bf5\u508d\u5f66\u9187\u7b1b\u79bd\u52ff\u5a1f\u7784\u5e62\u5bc7\u7779\u8d3f\u8e29\u9706\u545c\u62f1\u5983\u8511\u8c15\u7f1a\u8be1\u7bf7\u6df9\u8155\u716e\u5029\u5352\u52d8\u99a8\u9017\u7538\u8d31\u7092\u707f\u655e\u8721\u56da\u6817\u8f9c\u57ab\u5992\u9b41\u8c23\u5bde\u8700\u7529\u6daf\u6795\u4e10\u6cf3\u594e\u6ccc\u903e\u53ee\u9edb\u71e5\u63b7\u85c9\u67a2\u618e\u9cb8\u5f18\u501a\u4fae\u85e9\u62c2\u9e64\u8680\u6d46\u8299\u5783\u70e4\u6652\u971c\u527f\u8574\u573e\u7ef8\u5c7f\u6c22\u9a7c\u5986\u6346\u94c5\u901b\u6dd1\u69b4\u4e19\u75d2\u949e\u8e44\u72ac\u8eac\u663c\u85fb\u86db\u8910\u988a\u5960\u52df\u803d\u8e48\u964b\u4fa3\u9b45\u5c9a\u4f84\u8650\u5815\u965b\u83b9\u836b\u72e1\u9600\u7ede\u818f\u57ae\u830e\u7f05\u5587\u7ed2\u6405\u51f3\u68ad\u4e2b\u59ec\u8bcf\u94ae\u68fa\u803f\u7f14\u61c8\u5ac9\u7076\u5300\u55e3\u9e3d\u6fa1\u51ff\u7eac\u6cb8\u7574\u5203\u904f\u70c1\u55c5\u53ed\u71ac\u77a5\u9ab8\u5962\u62d9\u680b\u6bef\u6850\u7802\u83bd\u6cfb\u576a\u68b3\u6749\u6664\u7a1a\u852c\u8747\u6363\u9877\u9ebd\u5c34\u9556\u8be7\u5c2c\u786b\u56bc\u7fa1\u6ca6\u6caa\u65f7\u5f6c\u82bd\u72f8\u51a5\u78b3\u54a7\u60d5\u6691\u54af\u841d\u6c79\u8165\u7aa5\u4ffa\u6f6d\u5d0e\u9e9f\u6361\u62ef\u53a5\u6f84\u840e\u54c9\u6da1\u6ed4\u6687\u6eaf\u9cde\u917f\u8335\u6115\u7785\u66ae\u8859\u8beb\u65a7\u516e\u7115\u68d5\u4f51\u5636\u5993\u55a7\u84c9\u5220\u6a31\u4f3a\u55e1\u5a25\u68a2\u575d\u8695\u6577\u6f9c\u674f\u7ee5\u51b6\u5e87\u6320\u6402\u500f\u8042\u5a49\u566a\u7a3c\u9ccd\u83f1\u76cf\u533f\u5431\u5bdd\u63fd\u9ad3\u79c9\u54fa\u77e2\u556a\u5e1c\u90b5\u55fd\u631f\u7f38\u63c9\u817b\u9a6f\u7f06\u664c\u762b\u8d2e\u89c5\u6726\u50fb\u968b\u8513\u548b\u5d4c\u8654\u7554\u7410\u789f\u6da9\u80e7\u561f\u8e66\u51a2\u6d4f\u88d4\u895f\u53e8\u8bc0\u65ed\u867e\u7c3f\u5564\u64d2\u67a3\u560e\u82d1\u725f\u5455\u9a86\u51f8\u7184\u5140\u5594\u88f3\u51f9\u8d4e\u5c6f\u819b\u6d47\u707c\u88d8\u7830\u68d8\u6a61\u78b1\u804b\u59e5\u745c\u6bcb\u5a05\u6cae\u840c\u4fcf\u9eef\u6487\u7c9f\u7caa\u5c39\u82df\u766b\u8682\u79b9\u5ed6\u4fed\u5e16\u714e\u7f15\u7aa6\u7c07\u68f1\u53e9\u5450\u74
76\u5885\u83ba\u70eb\u86d9\u6b79\u4f36\u8471\u54ee\u7729\u5764\u5ed3\u8bb3\u557c\u4e4d\u74e3\u77eb\u8dcb\u6789\u6897\u5395\u7422\u8ba5\u91c9\u7a9f\u655b\u8f7c\u5e90\u80da\u547b\u7ef0\u627c\u61ff\u70af\u7aff\u6177\u865e\u9524\u6813\u6868\u868a\u78c5\u5b7d\u60ed\u6233\u7980\u9102\u9988\u57a3\u6e85\u549a\u9499\u7901\u5f70\u8c41\u772f\u78f7\u96ef\u589f\u8fc2\u77bb\u9885\u7409\u60bc\u8774\u62e3\u6e3a\u7737\u60af\u6c70\u6151\u5a76\u6590\u5618\u9576\u7095\u5ba6\u8db4\u7ef7\u7a98\u8944\u73c0\u56a3\u62da\u914c\u6d4a\u6bd3\u64bc\u55dc\u625b\u5ced\u78d5\u7fd8\u69fd\u6dcc\u6805\u9893\u718f\u745b\u9890\u5fd6\'\n\n# read in the fonts from the provided files\nfont_paths = glob.glob("./font_files/*")\nfonts = [ImageFont.truetype(p, 75, 0) for p in font_paths]\n\n# Here are a few functions for creating an image of a character from fonts:\n\n# add some blur to avoid unrealistically sharp edges on characters\ndef blur_image(im, kernel=(0,0), std=1):\n return Image.fromarray(cv2.GaussianBlur(np.array(im), kernel, std))\n\n# trim excessive whitespace around the character\ndef trim(im):\n bg = Image.new(im.mode, im.size, im.getpixel((0,0)))\n diff = ImageChops.difference(im, bg)\n diff = ImageChops.add(diff, diff, 2.0, -100)\n bbox = diff.getbbox()\n if bbox:\n return im.crop(bbox)\n else:\n return im\n\n# create an image from a font file \ndef setup_char_image(char, font, shape=(75,75)):\n image = np.ones(shape=shape, dtype=np.uint8)\n x = Image.fromarray(image)\n draw = ImageDraw.Draw(x)\n draw.text((0,0),char,(0),font=font)\n p = ((255.0 / np.array(x).max() * (np.array(x) - np.array(x).min()))\n .astype(np.uint8))\n return Image.fromarray(p)\n\n# create the images using the functions above\ndef create_char_image(charset,fonts):\n # the images will be stored in folders labeled as the respective\n # char - some chars cannot be used in folder names in some OS\n forbidden_chars = {"<": "less_than",\n ">": "greater_than",\n ":": "colon",\n \'"\': "double_quote",\n "/": "forward_slash",\n "\\\\": "backslash",\n "|": "vertical_bar",\n "?": "question_mark",\n "*": "asterisk",\n ".": "full_stop"}\n \n # iterate over all chars and fonts\n for char in charset:\n for font in fonts:\n dir_name = (char \n if not char in forbidden_chars.keys() \n else forbidden_chars[char])\n file_name = str(font)[38:-1]\n \n save_path = "data/" + dir_name\n if not os.path.exists(save_path):\n os.makedirs(save_path)\n \n trim(\n blur_image(\n setup_char_image(char,font)\n )).resize((75,75)\n ).save(save_path + "/" + file_name + ".png")\n\n\nif __name__ == "__main__":\n # create images of chinese chars in charset form the provided fonts files\n create_char_image(charset,fonts)\n```\nI have provided this script along with the font files as a [public data set](https://www.kaggle.com/pascalbliem/chinese-characters-from-handwritingstyle-fonts) on Kaggle, feel free to use it yourself. Let\'s now have a look at the images this method produced.\n\nImages of the Chinese character \u4e24, generated from font files.\n\nThe results are actually not bad. Some of the characters do really look quite a bit like handwriting. However, even with data augmentation, I knew about 120 images per character were not going to be enough if I want to classify thousands of different characters. \n\n### Getting real handwritten data\n\nI kept searching for data sets of actual handwritten characters and eventually found some resources. 
Among them were two datasets from the Harbin Institute of Technology called [HIT-MW](https://sites.google.com/site/hitmwdb/) and [HIT-OR3C](http://www.iapr-tc11.org/mediawiki/index.php?title=Harbin_Institute_of_Technology_Opening_Recognition_Corpus_for_Chinese_Characters_(HIT-OR3C)). Another great resource I came across was the [CASIA Online and Offline Chinese Handwriting Databases](http://www.nlpr.ia.ac.cn/databases/handwriting/Home.html) created by the Chinese National Laboratory of Pattern Recognition (NLPR) and the Institute of Automation of the Chinese Academy of Sciences (CASIA). They had collected about 3.9 million writing samples from 1020 writers, covering 7185 Chinese characters and 171 symbols overall. This sounded pretty impressive to me and I saw that [some](https://github.com/soloice/Chinese-Character-Recognition) [other](https://pdfs.semanticscholar.org/4941/aed85462968e9918110b4ba740c56030fd23.pdf) works had successfully used this dataset, so I decided to go with this one as well. The data files are available for [download](http://www.nlpr.ia.ac.cn/databases/handwriting/Download.html) on their website; however, not exactly *ready to use*.\n\nInstead of image files, the authors provided their data in the form of binary files with some custom encoding:\n\n
| Item | Type | Length | Instance | Comment |
| --- | --- | --- | --- | --- |
| Sample size | unsigned int | 4B | | Number of bytes for one sample (byte count to next sample) |
| Tag code (GB) | char | 2B | "\u554a"=0xb0a1 | Stored as 0xa1b0 |
| Width | unsigned short | 2B | | Number of pixels in a row |
| Height | unsigned short | 2B | | Number of rows |
| Bitmap | unsigned char | Width*Height bytes | | Stored row by row |
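To make the layout above a bit more tangible, here is a quick back-of-the-envelope check of how many bytes a single record occupies; the header always takes 4 + 2 + 2 + 2 = 10 bytes, followed by the bitmap (the sample dimensions below are made up for illustration):

```python
# Rough size check for one record of the format above,
# using hypothetical sample dimensions.
width, height = 60, 70        # e.g. a 60 x 70 pixel writing sample
header_bytes = 4 + 2 + 2 + 2  # sample size + tag code + width + height
sample_size = header_bytes + width * height
print(sample_size)            # 4210 bytes until the next record starts
```

The "sample size" field is exactly this byte count, which is what lets a decoder jump from one record to the next within a file.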
\n
\n\nIt took me a while to figure out the proper way to handle binary files with Python. The `struct` package seems to have everything needed to customly decode everything byte by byte (you can find a list of all `struct` format characters [here](https://docs.python.org/3/library/struct.html#format-characters)) and with `PIL`, we can turn it into image files:\n\n```python\n# handling binary files\nimport struct\n# image processing\nfrom PIL import Image, ImageDraw\nimport numpy as np\n# some I/O functionality\nimport glob\nimport os\n\n# path to folder containing the unzipped binary files\ndata_folder = "Gnt1.0TrainPart1"\n# path of the folder the images should be saved in\ntrain_test = "Train"\n\n# iterate over all the unpacked .gnt binary files\nfor path in glob.glob(data_folder + "/*.gnt"):\n \n filesize = os.path.getsize(path)\n \n with open(path, "rb") as file:\n content = file.read()\n \n # while the counter is smaller than the size of the file, keep iterating\n counter = 0\n while counter < filesize:\n \n # size in bytes of one character sample\n sample_size = struct.unpack("I",content[counter:counter+4])[0]\n \n # unpack th two tag codes that represent the character label\n # and merge them together (ignoring NULL bytes b\'\\x00\')\n tag_code1 = struct.unpack("cc",content[counter+4:counter+6])[0]\n tag_code2 = struct.unpack("cc",content[counter+4:counter+6])[1]\n tag_code = ((tag_code1 + tag_code2).decode("GBK") \n if tag_code2 != b\'\\x00\' \n else (tag_code1).decode("GBK"))\n \n # the images will be stored in folders labeled as the respective\n # character - some chars cannot be used in folder names in some OS\n forbidden_chars = {"<": "less_than",\n ">": "greater_than",\n ":": "colon",\n \'"\': "double_quote",\n "/": "forward_slash",\n "\\\\": "backslash",\n "|": "vertical_bar",\n "?": "question_mark",\n "*": "asterisk",\n ".": "full_stop"}\n \n if tag_code in forbidden_chars.keys():\n tag_code = forbidden_chars[tag_code]\n \n # unpack width and hight of the writing sample\n width = struct.unpack("H",content[counter+6:counter+8])[0]\n height = struct.unpack("H",content[counter+8:counter+10])[0]\n area = width * height\n \n # unpack the bitmap that represents the image of the writing sample\n bitmap = (np.array(struct.unpack("B" * area,\n content[counter+10:counter+10+area]))\n .reshape(height,width))\n \n bitmap = np.where(bitmap!=255,\n ((255.0 / bitmap.max() * (bitmap - bitmap.min())).astype(np.uint8)),\n bitmap).astype(np.uint8)\n \n # create an image object from the bitmap\n image = Image.fromarray(bitmap)\n ImageDraw.Draw(image)\n \n # save the image in a folder labeled as the corresponding character\n save_path = train_test + f"/{tag_code}"\n if not os.path.exists(save_path):\n os.makedirs(save_path)\n file_name = str(len(glob.glob(save_path + "/*"))+1) + ".png"\n image.save(save_path + "/" + file_name)\n \n # increment the byte counter \n counter += sample_size\n\n```\nA sample of the resulting character images can be seen in the figure below. It is quite remarkable how different the handwriting styles of different writers can be. For some of the images, I couldn\'t have said with certainty what character they display. This may be even more challenging for a machine learning algorithm and highlights why many currently available OCR apps fail at recognizing handwriting.\n\nImages of the Chinese character \u4e24, written by hand by different writers.\n\nDownloading, unpacking, and processing all this data took quite a while. 
If you want to skip the effort and work with image files directly, I have put them on Kaggle as a [public dataset](https://www.kaggle.com/pascalbliem/handwritten-chinese-character-hanzi-datasets). Feel free to use them and tell me about your results.\n\n### What next?\n\nI originally had the intention to build an OCR app based on this data set. There are plenty of OCR apps for printed characters available and I also found a [paper](https://pdfs.semanticscholar.org/4941/aed85462968e9918110b4ba740c56030fd23.pdf) from Stanford students in which they trained a deep convolutional neural network on a subsection of this dataset and got great accuracy (around 95%) even with more than 3000 classes. So, I started off quite optimistic. Unfortunately, I don\'t have a GPU and I realized that this task (classifying thousands of classes on millions of images) was absolutely intractable on my hardware, even if I had my backup laptop running for weeks. So, where could I get a GPU from without spending money? Since Kaggle has introduced its 30-hour limit, there is not much room for experimentation on this platform anymore. I tried to use [Google Colab](https://colab.research.google.com/), which offers free GPU time with some limitations, in combination with my Google Drive for persisting model checkpoints, but with little success. Somehow, Colab\'s file system couldn\'t handle unpacking millions of files and threw some errors all the time or left me with empty files which, of course, crashed TensorFlow when it tried to decode such "images". Buying GPU time from cloud providers isn\'t exactly cheap either. I\'m mostly active on AWS, where the cheapest GPU instance type costs around 1.3 USD per hour. I decided that I wasn\'t willing to spend a non-negligible amount of money on this project *just for fun* and ended it here.\n\nIf you have some powerful GPUs in your possession and an interest in Chinese OCR, why don\'t you try to pick up the challenge where I left off and tell me how it went? I\'d be really interested in seeing how well a handwriting OCR app would work.\n\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("How to Quarter-Life-Crisis your Way into Data Science","My personal story of how I left academia, traveled around Asia, and got into Data Science on the way",new Date("2020-03-27"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/pie-chart.png","When you've seen all sides of the pyramid, it's time to reach for the sky!",["Learning","Non-Tech"],"So this is going to be an article about how I got into data science and how that's the result of a series of slightly chaotic events. When I was starting out back then, I came across a lot of blog posts from people who were documenting their stories, how they made the decision to change their career path, how they studied, and how - despite their struggles - their journey was usually very rewarding. Those posts always motivated me to keep going when I was questioning if I had set the right goals. Hence, I thought I might just contribute such a piece of motivation for those who find themselves in the same situation as I did almost a year ago. A friend of mine recently wrote a great blog post titled [\"Quarter Life Crisis\" ](http://regitamhrdk.wixsite.com/website/post/quarter-life-crisis) (in Bahasa Indonesia), in which she pointed out that this phenomenon of anxiety about not knowing where to go in life is actually a great opportunity to grow and advance beyond what we've done so far. 
I do absolutely agree with this and I am going to tell you how this worked out for me. \n\n**A word of warning:** This post is way too long, and not really the type of post from which you learn much about tech-stuff. It's my personal story and it is written as such. If you're (understandably) not interested in my personal background, feel free to skip to the section where I write about how I [learned data science](#learn).\n\n### I graduated, and now what? \n\nAt the time of writing this, I'm 27 years old. About two years ago, I was at a point at which everything seemed to be amazing. I had just graduated with honors from my master program in materials engineering from one of Germany's top engineering universities, after finishing a very though but successful master thesis, which even got [published](https://www.nature.com/articles/s41598-018-34042-1) in a peer-reviewed journal. I was looking back at 5 years of successful scholarly work and materials science research, many [amazing research topics](https://github.com/Pascal-Bliem/my-papers-and-theses), and research stays at universities in Sweden and Taiwan, as well as industrial internships. For the past years I had always imagined myself doing a PhD in materials science and becoming a senior scientist. Now, there I was at the point in life I had imagined for so long, having several PhD offers at hand but...somehow...something didn't feel right.\n\n![Watching the sunrise from within the laboratory tells you that you spent too much time in there.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/sunriselab.jpg)\n\nI looked back at the last couple of years and realized I had done most of the *PhD-kind-of-things* already; spending days and nights in the lab working on my projects, visiting fancy research facilities, (co)authoring papers, giving talks about my research etc. Did I really want to spend the next 4 years with what I already had been doing for the last 4 years? I did not. Yet, I had gotten so used to the idea of becoming a scientist and holding a PhD degree (maybe a little ego was involved there) that it wasn't easy at all to reconsider my options. I didn't know what to do - the PhD or...or what else? - but I knew that I wasn't happy the way that I though I would be. In addition, I was so tired of living in the same environment for years and just went through a break-up of a three years relationship. I didn't really have a backup plan at that point, but I just felt that I had to get out somehow. So when my friend told me, \"If you don't know what to do, just go travel the world!\", that's exactly what I did.\n\n### Let's not decide what to do with my life just now\n\nObviously, some people in my social surroundings thought I went downright insane, but there was also a lot of positive feedback for my idea from my friends and family. I sold pretty much everything I couldn't fit in one backpack, gave up my apartment, and booked a one-way-ticket to Bali. I told myself that I could just postpone the decision of what to do with my life for a couple of months and that I probably will be wiser after having seen more of the world. I backpacked through 10 countries in southeast Asia and the Indian subcontinent, snorkeling around tropical islands, driving motorbikes through tea fields and along coastlines, taking part in silent meditation retreats, hiking over the Himalayas, and floating down the Mekong river for days. 
I saw a lot of amazing stuff, but at some point I started feeling exhausted again of seeing new things everyday without really doing anything productive. I felt like I wasn't generating much value for anyone else, so I started looking for some useful work to do.\n\n![On the way to hopefully finding an idea what to do next...](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/world_map.png)\n\nSince I had always enjoyed teaching stuff, I thought I might give it a try and started volunteering as an English teacher in a school in a remote Vietnamese mountain town. That was a great experience, but I realized quickly that trying to explain the English language to little kids who don't speak a word of English yet, *using English*, didn't make too much sense. Next I tried volunteering in a [fantastic English school](http://ilec.edu.vn/) next to the technological university in Hanoi, talking English to young adult engineering students. This worked a lot better, but still, I realized that I really missed doing something science and tech related. So I changed countries and started volunteering as a Science and Maths teacher in a [school](https://jogjacommunityschool.org/) in Jogjakarta, Indonesia. The pedagogical part of that adventure was quite challenging and rewarding, the \"sciency\" part, however, was not. The concepts taught at a high school level were just too shallow and too far away from application to keep me interested. At that point I had to admit to myself that I was and always will be that tech guy who only feels fulfilled when wrapping his brains around complex problems and crafting solutions for them.\n\n### Maybe it's about time to come up with a plan now\n\nAt that point, about 9 months into my trip, I faced the postponed decision again. What am I going to do with my live? I was still quite sure about not wanting back into academic research. So, start working as a materials engineer in industry? I had done internships in that field before and I didn't really feel like I wanted to be working in industrial production again. Actually, I realized that most of the engineering job descriptions I found interesting, required decent coding skills. Maybe that was even the very reason why I found them so interesting. I was really interested into programming since I took a Java class in my bachelors and I had been using quite a bit of Python for automation and data processing when I was working on molecular dynamics or quantum mechanics simulations on a super-computer during my materials research. But next to my regular lab work back then, I never had the time to really professionalize my coding skills far enough to make myself employable in a coding job. One of those backlog items that never had made to *work-in-progress*.\n\nWas it the right time to tackle this now? Could I even do this by myself, without a formal education in the field? Many blog post authors and digital nomads I met on my travels told me that they had done something like that, so yeah, I should be able to succeed at it as well. 
But again, my ego (and some relatives of mine) had some doubts: \"You spend about 5 years becoming good at scientific research, had mentionable successes, and now you want to leave all that behind and start off as a beginner again?\" Anyway, there I was, sitting in my hot and tiny Kos (an indonesian boarding house) room next to an active volcano, in the wonderful and very cheap city of Jogjakarta, still some money on my bank account, basically no current job commitment, and already speaking decent Indonesian. If there was one right point in space and time to do it, it was this one. After having to go on a visa run to Malaysia shortly, I had made up my mind. I changed to another volunteering organization where I could work part-time and came back to Jogja with the ambition to spend the next couple of months self-teaching myself to ramp up my coding skills, become an actual programmer, and get a job within half a year.\n\n\n### How I figured out what I want and how to get there \nWhere the heck should I start?! Computer science seemed (and still does seem) so gigantic and overwhelming, so many sub-disciplines to chose from, and half of the vocabulary sounded like alien language to me at that time. I probably spent a whole week or so searching Google and YouTube for phrases like *how to teach yourself programming*, *what discipline of CS is right for me*, *what programming language to start with*, *please end my suffering* and so on. I wanted to be able to properly build cool software solutions, so the software engineering/developing aspect sounded appealing to me. I also wanted to keep doing experimental, investigative things, like I had done when I was still working in research. Luckily, I came across **Data Science** which seemed to perfectly unite those parts. The often cited quote \"A Data scientist is someone who is better at statistics than a software engineer and better at software engineering than a statistician\" appealed to me as a job description and, furthermore, I actually already had quite some suitable experience: I had been analyzing my data with Python (the most popular language in Data science) before, lots of experience in looking for patterns in data, and a good understanding of the linear algebra and differential calculus behind most machine learning models. So now I knew *where* to go, I just had to figure out *how* to get there.\n\n### Massive Open Online Courses (MOOCs)\nI definitely wasn't going to go back to university for a whole program, but I also didn't get much out of all the tiny YouTube and Medium blog post tutorials that started treating a problem somewhere in the middle and left off before the end. In needed something more holistic which I could still work on flexibly, following my own schedule (and budget). Some presumably great options like [Udacity](https://www.udacity.com/) nano degrees or [Data Camp's](https://www.datacamp.com/) different tracks were unfortunately out of the price range I was looking for (which meant for free). There was unfortunately not much about data science on [FreeCodeCamp](https://www.freecodecamp.org/). [Udemy](https://www.udemy.com/) also seemed to have some great courses on a discount price sometimes. Nowadays I'm using Udemy a lot with a business account, but back then (without a salary, living on less then 10 dollars per day) I had to get it cheaper. 
Luckily, I came across MOOCs (massive open online courses) on platforms like [Coursera](https://www.coursera.org/) and [EdX](https://www.edx.org/), which demand payment for getting a course certificate, but let you audit the courses for free. \n\n![A small selection of all the amazing sources of data science and software development knowledge.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/courses.png)\n\nAfter browsing through their catalogues I decided to go with Coursera because they offered some specializations which seemed to suit my demands quite well. And maybe also because they offered 7 days free trials which I shamelessly abused to cram some courses and get the certificates for free. Anyway, I went with the [Applied Data Science with Python](https://www.coursera.org/specializations/data-science-python) specialization offered by the University of Michigan, which consisted of five courses covering data manipulation and processing, visualization, applied machine learning, text mining, and network analysis. This gave me a pretty good overview about what different branches data science consists of, and especially the machine learning part got me hooked. Theses courses did, however, not elaborate on any general programming or software engineering concepts. To make up for this shortcoming, I also took the [Object Oriented Java Programming: Data Structures and Beyond](https://www.coursera.org/specializations/java-object-oriented) specialization offered by truly fantastic instructors of the University of California San Diego. The individual courses covered the most common algorithms and data structures for sorting, searching, and graph operations, and I finished the specialization with a [project on social network analysis](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance).\n\n\n### Podcasts - Your best friends while on the road\n\nI was very impatient with learning as much as I could in as little time as possible because I wanted to feel less like an absolute beginner as soon as possible and, also, I was running out of money and needed to get a job at some point. I wondered how I could keep learning about data science and coding when I couldn't do my online courses, for example while I was sitting on my motorbike or strolling over the market to get some veggies and tofu. I started looking for podcasts and found quite a handful that were really good. I loved listening to [Partially Derivative](http://partiallyderivative.com/) or [Data Framed](https://www.datacamp.com/community/podcast) (which are unfortunately discontinued) while I was driving through Javanese country roads and rice fields and I'm still listening to [Data Skeptic](https://dataskeptic.com/), [Linear Digressions](http://lineardigressions.com), the [Data Engineering Podcast](https://www.dataengineeringpodcast.com/) and [Developer Tea](https://spec.fm/podcasts/developer-tea) almost every day. Despite not paying perfect attention all the time and initially not understanding most of the concepts, it was great to be fully immersed in data science topics all the time and it helped to expand my mental map of all the topics encompassed under that big umbrella term of *Data Science*. 
After every new episode, I had a dozen new concepts to look up and it always gave me a little confidence boost when I realized that I already understood a little more when listening to the next episode.\n\n![Some of the podcasts that I used for learning data science when I could have my eyes on a screen.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/podcasts.png)\n\n### Building a portfolio with real projects\nAbout two and a half months into the endeavour, I got to the point where I had finished up most of the course work that I had planned for and had gotten a good theoretical grasp of most things I wanted to learn, but I hadn't actually build a lot of original stuff. The next step was to apply what I had learned in some end-to-end (yet manageably small) projects. That was particularly important to me because I figured that - having hold no prior job with the title of Data Scientist and not having a perfectly matching university degree - I definitely needed some portfolio projects to present to potential employers. Okay, but what should I do? When you google for \"how to come up with project ideas\", you'll find a lot of posts telling you that you should build applications that are going to solve your personal problems or cater your needs in some way. But most *data products* I could think of would probably take a handful of developers a couple of months to build. What *smaller* problems did I want to solve? And which of those were original? I had seen the Titanic, Iris flower, and housing price data sets so often that I wondered if there aren't any other data sets out there. I searched Kaggle and the [UCI Machine Learning Repository](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository) and did some machine learning tasks on what I found, but with those prepared data sets, it didn't feel like solving real world problems either. Should I be building a visual WhatsApp [chat log analysis](https://github.com/Pascal-Bliem/whatsapp-chatlog-analysis) to disprove my friend's false claims about what she had said when and in which way? That wasn't really enough. Trying to understand certain concepts better that I had struggled with before, such as [error rate control](https://github.com/Pascal-Bliem/error-control-statistical-tests) in statistical significance testing? Very interesting but not really end-to-end either. \n\n![It's easy to get a little confused about hypothesis testing and how false positives, false negatives, and statistical power influence each other.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/falsepositivenegative.png)\n\nThen it struck me that I was (presumably) going back to Europe fairly soon to look for work. I had been away from home for more than a year and I hardly ever hung out with other Europeans during that time. Even before I departed, I spend most of my time in my very academic social bubble, and I felt a little detached from the European people. Time to reintegrate, I thought! After some searching, I came across the amazing [European Social Survey](https://www.europeansocialsurvey.org/), which describes itself as \"an academically driven cross-national survey that has been conducted across Europe since its establishment in 2001\". Extracting information from this data would be a great way to mentally prepare myself for getting back to Europe and get an original [portfolio project](https://github.com/Pascal-Bliem/european-social-survey) which I could present to potential future employers. 
The fairly large data sets held the answers of tens of thousands of respondents to face-to-face interviews with hundreds of questions in over 20 participating countries. It had all sorts of info on social topics, climate change, politics, immigration, individual well-being, and much more. And also, being scrambled together from dozens of universities, it was messy: nonsensical variable encodings, inhomogeneous scales, country-specific questions and so on. I probably spent most of the time digging through the documentation and cleaning/preprocessing the data. But then, I could do exploratory analysis of variable correlations with interactive visualizations, investigate differences between countries with statistical hypothesis testing and determine their effect sizes, and - most amazingly - predict individual people's personally perceived happiness with machine learning, using their other answers to survey questions as features. It turned out to be that proper end-to-end project I had been looking for and I learned a lot along the way, both regarding data science methodology and the social insights of the survey.\n\n### The job search\nNow, about three and a half months into the entire process, I felt like I really had the must-haves for being employable in my repertoire, but I caught myself having the same thoughts over and over: I gotta study one more of these statistical concepts, then I'll apply for jobs, got to check out one more of these cool libraries, understand this family of algorithms, read this one paper or post, then I'll start applying for jobs. I was just in the process of trying to get a convolutional neural network to tell images of cats and dogs apart, when I finally came to the conclusion that it may be a little more important to care about my future than building a silly pet detector. There were still one and a half months to go until my flight back to Germany and I wouldn't be able to go to interviews right away if I were to be invited, so I first aimed at looking for jobs in Singapore, the closest industrialized country. Looking for jobs is always a daunting process but I wasn't quite prepared for how bad it would start off for me. Most companies didn't even reply. On top of that, tons of (probably automated) rejection mails. I thought that maybe the Singaporean job boards were useless, so I tried to track down hiring people from Asian tech giants like Grab, Gojek, Shopee etc. on LinkedIn - and no one replied. I felt like all the work of the last couple of months had been in vain. Apparently no one considered me employable, no one seemed to be interested in my projects or even wanted to talk to me. Was the entire decision one big mistake? I felt defeated. To this day, I don't know what the secret sauce of the Singaporean job market was; maybe it's personal referrals, which are important in a strongly Chinese-influenced culture; maybe it's the fact that one is always competing with lots of local university graduates for entry-level positions. Having wasted my time sending applications to Singapore, I finally turned my focus towards Europe again, and Germany in particular. \n\nAgain, I wasn't quite prepared for the feedback I got, but now it was the other way around. I had applied to many vacancies and put up my job search request on several European job boards, and suddenly, a ton of recruiting agencies and hiring managers wanted to talk to me. 
Now I had to stay up late at night (due to the different time zones) to attend several rounds of phone or video call interviews with about 30 companies and agencies. I even had to start turning people down because I couldn't fit them into my calendar anymore. Apparently the German IT job market at that time had a very strong demand for skilled workers, so people wanted to hire me for all sorts of developer jobs, not only for data science and AI. That also meant that the interviews were of very varying quality. Many of these third-party recruiting agencies clearly had no idea about the technologies or even about the clients they were hiring for. Neither did they bother to thoroughly read my CV or check my LinkedIn before pulling me into hour-long phone calls, which would have answered most of their questions in advance. I often had to narrate my job-search story over and over again until I felt like a stuck vinyl record. Some other interviews were great though, despite sometimes non-ideal conditions. The company that ended up hiring me, for example, interviewed me while I was sitting in a Hanoi co-working space with a horrible internet connection and lots of motorcycle noise from the busy streets outside. The month or so that I spent on so many calls was tough; not only because of the job search itself, but also because I didn't have much time to keep working on my personal project or to spend more time with my friends in Indonesia and Vietnam whom I wouldn't be seeing again for a long while. However, the efforts did pay off, as I was finally ready to return to Germany and face the final stage of my job search.\n\n### Europe - The Final Countdown (pun intended)\nAfter a couple of awfully long flights and stopovers, for the first time in almost one and a half years, I stepped onto wintry German soil again. Everything felt a little unreal to me; the temperature drop of about 30\xb0C, how *orderly* everything was compared to Indonesia, temporarily moving in with my family, and drinking amazing German beer again. I wasn't left with much time to acclimatize though, as I was thrown into a marathon of on-site job interviews all over the country. It was two weeks till Christmas, everyone wanted to get their hiring processes finished before the holidays, and that meant that I spent those two weeks entirely in cars or on trains, moving from one city to another. After having two interviews in one day in the south of Germany, I took a night train to the mid-west, slept a few hours in a train station hostel, and squeezed in the interview with the company that ended up hiring me, only to rush on to the next city and next interview. Most of the job interviews went pretty well, despite the rush and my (perceived) lack of preparation. Nonetheless, many of them didn't really go into that generalist data science direction I was looking for the most. One of the large insurance companies I interviewed with, however, was just in the process of building up a new data science team that was supposed to be employed in a very wide range of use cases. Needless to say, I was very happy when they called and told me that they wanted me for the job and that they'd send me the contract...after the works council had signed the hiring decision...which wasn't going to happen until two days before Christmas. The next days, in which I tried stalling the other offers I had already gotten, were quite tense until I finally got the call saying that I got the job, literally on the last day before the holidays. 
\n\nI was able to relax a bit...for about half a day, until I realized that now, I had to start looking for a place to live in a city with one of Germany's toughest housing markets. That city also happened to be my place of birth - kind of came full circle here. I was, again, incredibly lucky and found a very tiny and expensive room in a shared apartment very close to my future office. I moved there with everything I owned at that point; my laptop, one suitcase, one backpack, and a box full of hats from seven different countries. And there I was, in the new Data Scientist job for which I had been working so hard for the last six months. I got a massive amount of input within the first couple of weeks, talked to dozens of people and tried to get an overview of all the different IT systems I'd have to be working with. I can confidently claim that an enterprise with almost 120 years of history and about 20000 employees is nothing short of complex. Despite the inertia that such a large company can have at times, there is a huge wealth of potential in its data and plenty of possibilities to contribute business value with data science methods. I now work in an awesome team of data scientists, data engineers, and data strategists on use cases about churn and retention, customer value models, recommender systems, marketing support, process automation with text mining, and many other fascinating topics. This work gives me a great chance to take the research mindset I cultivated in academia, combine it with powerful software engineering, and transform it into real value for the business. Looking back now at everything I have done since my master's graduation, I can say that all the decisions and efforts - not going for the PhD, traveling the world, and breaking into data science - were totally worth it. It was an amazing time and experience and it makes me look forward even more to all the great things to come.\n\n![The view from beside my new office onto my place, or as people here say: Home is where the Dom is.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/quarter-life/rhineanddome.jpg)\n\nIn case you made it this far, thanks a lot for reading! If you have a similar story or you are planning to change your career path into data science or development, feel free to contact me and exchange some ideas on this exciting topic.\n\nCheers,\nPascal")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Exploring less common machine learning data sets","Getting beyond Iris flowers, Titanic survivors, and housing prices",new Date("2020-01-09"),"https://thumbor.forbes.com/thumbor/960x0/https%3A%2F%2Fspecials-images.forbesimg.com%2Fdam%2Fimageserve%2F966248982%2F960x0.jpg%3Ffit%3Dscale","Finding some new stuff to learn for our machines.",["Data Science & AI/ML","Learning"],'Probably everyone who starts to learn machine learning will, right away, come across plenty of tutorials or blog posts that use one of maybe a handful of extremely common data sets, such as the [Iris flower data set](https://archive.ics.uci.edu/ml/datasets/iris), the [Titanic survivor data set](https://www.kaggle.com/c/titanic), the [Boston Housing data set](https://www.kaggle.com/c/boston-housing), the [MNIST handwritten digits](http://yann.lecun.com/exdb/mnist/), or the [IMDB Movie Reviews](http://ai.stanford.edu/~amaas/data/sentiment/). 
These data sets are so popular that they\'re used all over the place, over and over again, and even come delivered with some popular machine learning frameworks such as [Scikit-Learn](https://scikit-learn.org/stable/datasets/index.html) or [Keras](https://keras.io/api/datasets/). They are great for quickly benchmarking or proof-of-concepting a model, but let\'s face it, they\'ve become pretty boring to everyone who\'s been immersed in the world of machine learning for a while. There are so many other interesting data sets, well suited for practicing machine learning, that have gotten way less attention. Many of them can be found on websites like [Kaggle](https://www.kaggle.com/) or the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). In this post I want to point out some interesting examples (the list is by no means comprehensive) I used for understanding some machine learning concepts.\n\nI will go through the following five topics/data sets:\n- [Regression on Automobile Model Import Data](#automobile)\n- [Ensemble methods for Pulsar Neutron Star Classification](#neutronstar)\n- [Dimensionality reduction on colonoscopy video data](#colonoscopy)\n- [Optimized deep neural networks learning poker hands](#poker)\n- [ResNet CNN for classifying cats and dogs](#catdog)\n\n### Regression on Automobile Model Import Data \nLet\'s start with an old data set from the 1985 Ward\'s Automotive Yearbook, which is a great example of a regression problem as it has continuous, ordinal, and categorical (nominal) features which can be used to predict the prices of car models imported into the USA in 1985. You can find the corresponding Jupyter Notebook and data [here](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository/tree/master/Regression-on-automobile-imports/).\n\nThe data set was contributed to the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Automobile) by Jeffrey C. Schlimmer, which is where I got it from.\n\n#### Frame the Problem\nLet\'s say we want to have a model that can predict the price of an import model car based on its features (e.g. fuel type, engine type, horsepower, mileage etc.). Maybe we are just personally interested in getting a car from abroad or we work for a car dealership that specializes in imports - and of course we want to know what price to expect. Or maybe our model will only be one part of a pipeline of models for an insurance company that needs, among many other inputs, a price prediction to feed to the next model which is supposed to come up with insurance policies for new imports. Let\'s go with the latter example here.\n\nWe clearly have a multivariate regression problem at hand and a data set with very few instances (and no new instances streaming in) which means it can easily be batch-processed. 
To evaluate the performance of our model, we will choose the coefficient of determination, *R2*, which is basically the proportion of the variance in the price that can be explained by our model from the features (the closer it is to 1, the better).\n\n\n```python\n# import libraries \nimport numpy as np # numerical computation\nimport pandas as pd # data handling\n# visualization\nimport matplotlib.pyplot as plt \nimport seaborn as sns \nsns.set_style("darkgrid")\n%matplotlib notebook\n```\n\n#### Data preparation\nLet\'s import the data into a data frame and have a look.\n\n\n```python\n# load data and show first 5 rows\nauto = pd.read_csv("imports-85.data",header=None,names=["symboling","normalized_losses","make","fuel_type","aspiration","num_of_doors","body_style","drive_wheels","engine_location","wheel_base","length","width","height","curb_weight","engine_type","num_of_cylinders","engine_size","fuel_system","bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"],na_values="?")\nauto.head()\n```\n\n
| | symboling | normalized_losses | make | fuel_type | aspiration | num_of_doors | body_style | drive_wheels | engine_location | wheel_base | ... | engine_size | fuel_system | bore | stroke | compression_ratio | horsepower | peak_rpm | city_mpg | highway_mpg | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | NaN | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 13495.0 |
| 1 | 3 | NaN | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111.0 | 5000.0 | 21 | 27 | 16500.0 |
| 2 | 1 | NaN | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154.0 | 5000.0 | 19 | 26 | 16500.0 |
| 3 | 2 | 164.0 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102.0 | 5500.0 | 24 | 30 | 13950.0 |
| 4 | 2 | 164.0 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115.0 | 5500.0 | 18 | 22 | 17450.0 |

5 rows × 26 columns
\n\n\n\n\n```python\n# print info\nauto.info()\n```\n\n \n RangeIndex: 205 entries, 0 to 204\n Data columns (total 26 columns):\n symboling 205 non-null int64\n normalized_losses 164 non-null float64\n make 205 non-null object\n fuel_type 205 non-null object\n aspiration 205 non-null object\n num_of_doors 203 non-null object\n body_style 205 non-null object\n drive_wheels 205 non-null object\n engine_location 205 non-null object\n wheel_base 205 non-null float64\n length 205 non-null float64\n width 205 non-null float64\n height 205 non-null float64\n curb_weight 205 non-null int64\n engine_type 205 non-null object\n num_of_cylinders 205 non-null object\n engine_size 205 non-null int64\n fuel_system 205 non-null object\n bore 201 non-null float64\n stroke 201 non-null float64\n compression_ratio 205 non-null float64\n horsepower 203 non-null float64\n peak_rpm 203 non-null float64\n city_mpg 205 non-null int64\n highway_mpg 205 non-null int64\n price 201 non-null float64\n dtypes: float64(11), int64(5), object(10)\n memory usage: 41.7+ KB\n \n\n\n```python\n# print describtion \nauto.describe()\n```\n\n\n\n\n
| | symboling | normalized_losses | wheel_base | length | width | height | curb_weight | engine_size | bore | stroke | compression_ratio | horsepower | peak_rpm | city_mpg | highway_mpg | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 205.000000 | 164.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 201.000000 | 201.000000 | 205.000000 | 203.000000 | 203.000000 | 205.000000 | 205.000000 | 201.000000 |
| mean | 0.834146 | 122.000000 | 98.756585 | 174.049268 | 65.907805 | 53.724878 | 2555.565854 | 126.907317 | 3.329751 | 3.255423 | 10.142537 | 104.256158 | 5125.369458 | 25.219512 | 30.751220 | 13207.129353 |
| std | 1.245307 | 35.442168 | 6.021776 | 12.337289 | 2.145204 | 2.443522 | 520.680204 | 41.642693 | 0.273539 | 0.316717 | 3.972040 | 39.714369 | 479.334560 | 6.542142 | 6.886443 | 7947.066342 |
| min | -2.000000 | 65.000000 | 86.600000 | 141.100000 | 60.300000 | 47.800000 | 1488.000000 | 61.000000 | 2.540000 | 2.070000 | 7.000000 | 48.000000 | 4150.000000 | 13.000000 | 16.000000 | 5118.000000 |
| 25% | 0.000000 | 94.000000 | 94.500000 | 166.300000 | 64.100000 | 52.000000 | 2145.000000 | 97.000000 | 3.150000 | 3.110000 | 8.600000 | 70.000000 | 4800.000000 | 19.000000 | 25.000000 | 7775.000000 |
| 50% | 1.000000 | 115.000000 | 97.000000 | 173.200000 | 65.500000 | 54.100000 | 2414.000000 | 120.000000 | 3.310000 | 3.290000 | 9.000000 | 95.000000 | 5200.000000 | 24.000000 | 30.000000 | 10295.000000 |
| 75% | 2.000000 | 150.000000 | 102.400000 | 183.100000 | 66.900000 | 55.500000 | 2935.000000 | 141.000000 | 3.590000 | 3.410000 | 9.400000 | 116.000000 | 5500.000000 | 30.000000 | 34.000000 | 16500.000000 |
| max | 3.000000 | 256.000000 | 120.900000 | 208.100000 | 72.300000 | 59.800000 | 4066.000000 | 326.000000 | 3.940000 | 4.170000 | 23.000000 | 288.000000 | 6600.000000 | 49.000000 | 54.000000 | 45400.000000 |
\n\nWe got 205 instances, 26 columns (of which 25 are features and one is the price which we\'ll try to predict), and the data types float, int, and object show that we are dealing with continuous, categorical, and in this case also ordinal data. The feature "symboling" is used by actuaries to assess the riskiness of a car: A value of +3 indicates that the car is risky, -3 that it is probably pretty safe. We also see that there are missing values. We\'ll have to find a strategy to deal with these later.\n\nWe can have a look at how the numerical features are distributed by plotting them in histograms:\n\n\n```python\n# plot histograms\nauto.hist(bins=20)\nplt.tight_layout()\n```\n\nWe can see a couple of things in these histograms: \n- It does not look like any of the numerical features were clipped off at any maximum or minimum value (which is great so we won\'t have to fix that).\n- The features are on very different scales, so we will have to rescale them later on.\n- Some of the features are quite skewed, which can be a problem for some algorithms. We could try to transform them into a more normally distributed shape later.\n- The `compression_ratio` seems to cluster into a low-ratio and a smaller high-ratio group. Since the data set is quite small, we should probably stratify for that when we split off a test set to avoid introducing sampling bias.\n\nWe can now put aside a hold-out test set.\n\n\n```python\n# assign each instance a low (0) or high (1) compression_ratio category for stratification\nauto["cr_cat"] = pd.cut(auto.compression_ratio,2,labels=[0,1])\n\nfrom sklearn.model_selection import StratifiedShuffleSplit\n# put 20% of the data aside as a test set\nsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1234)\nfor train_index, test_index in split.split(auto, auto["cr_cat"]):\n auto_train_set = auto.loc[train_index]\n auto_test_set = auto.loc[test_index]\n \n# remove the cr_cat from the data again\nfor set in (auto_train_set, auto_test_set):\n set.drop(["cr_cat"], axis=1, inplace=True)\n```\n\nNow that we have put a test set aside to protect us from snooping, we will do some exploration on a copy of our training data. Let\'s look for some (Pearson) correlation first.\n\n\n```python\n# calculate correlations with price\nauto = auto_train_set.copy()\nauto.corr().price.sort_values(ascending=False)\n```\n\n\n price 1.000000\n engine_size 0.872527\n curb_weight 0.866972\n horsepower 0.808510\n width 0.791171\n length 0.736342\n wheel_base 0.648699\n bore 0.543084\n normalized_losses 0.189377\n height 0.147473\n stroke 0.076966\n compression_ratio 0.031453\n symboling -0.116180\n peak_rpm -0.120205\n city_mpg -0.697621\n highway_mpg -0.725255\n Name: price, dtype: float64\n\n\nWe can also do that graphically for a few examples with Pandas\' super cool scatter matrix plot:\n\n\n```python\n# plot a scatter matrix\nattributes = ["price","engine_size","compression_ratio","highway_mpg"]\npd.plotting.scatter_matrix(auto[attributes])\nplt.tight_layout()\n```\n\nIt\'s clear that e.g. engine size and price are positively correlated, compression ratio and price don\'t seem to be correlated much, and mileage and price are negatively correlated.\n\nAt this point we could think about creating some combined features, calculated from the ones we got, or adding some polynomial features (e.g. price vs. mileage doesn\'t look perfectly linear). 
But it\'s a good idea to keep it simple first and engineer more features later on if necessary.\n\nWe should now come up with a strategy for the missing values.\n\n\n```python\n# show sum of missing values per attribute\nauto.isnull().sum()\n```\n\n\n symboling 0\n normalized_losses 29\n make 0\n fuel_type 0\n aspiration 0\n num_of_doors 2\n body_style 0\n drive_wheels 0\n engine_location 0\n wheel_base 0\n length 0\n width 0\n height 0\n curb_weight 0\n engine_type 0\n num_of_cylinders 0\n engine_size 0\n fuel_system 0\n bore 4\n stroke 4\n compression_ratio 0\n horsepower 1\n peak_rpm 1\n city_mpg 0\n highway_mpg 0\n price 4\n dtype: int64\n\n\nThe instances with missing price are not useful for us so we will drop them. We will also separate the price (our labels) from the features.\n\n\n```python\n# drop rows with missing price\nauto.dropna(subset=["price"], inplace=True)\n# separate labels and features\nauto_labels = auto.price.copy()\nauto_feat = auto.drop(["price"], axis=1)\n```\n\nWe could drop the instances with the other missing values as well, but since we only have a few instances in this data set it might be a good idea to try to fill the missing values (e.g. with the feature\'s median for numeric features or its mode for categorical features) instead. We can treat this choice (dropping or filling of missing values) as a hyper-parameter for our model. For the missing `num_of_doors` we will fill in the majority value. For numerical features, Scikit-Learn offers an Imputer which can be used to fill the value (e.g. the median) computed on the training set into missing values in the test set as well.\n\n\n```python\n# get the majority number of doors and use it to fill missing values\nnum_d = auto_feat.num_of_doors.value_counts().sort_values(ascending=False).index[0]\nauto_feat.num_of_doors.fillna(num_d, inplace=True)\n\n# import the imputer\nfrom sklearn.impute import SimpleImputer\nimputer = SimpleImputer(strategy="median")\n\n# extract the numerical features and fill their missing values\nauto_featnum = auto_feat[[c for c in auto_feat.columns if auto_feat[c].dtype!="O"]]\nimputer.fit(auto_featnum)\nauto_featnum_filled = pd.DataFrame(imputer.transform(auto_featnum),\n columns=auto_featnum.columns)\n```\n\nWe also have to rescale the numerical features, as they are on very different scales. We\'ll use standardization here.\n\n\n```python\n# standardize numerical features\nfrom sklearn.preprocessing import StandardScaler\nscaler = StandardScaler()\nauto_featnum_scaled = scaler.fit_transform(auto_featnum_filled)\n```\n\nSince the algorithms won\'t be able to work with the categorical data directly, we\'ll have to encode the categorical features first. `num_of_doors` and `num_of_cylinders` are ordinal (meaning that they have a natural order) but maybe not evenly spaced, and the other categorical features\' values have no natural order, so the simplest solution is to use One-Hot encoding for all categorical features.\n\n\n```python\n# import the one-hot encoder\nfrom sklearn.preprocessing import OneHotEncoder\nencoder = OneHotEncoder(sparse=False)\n# extract the categorical features\nauto_featcat = auto_feat.drop([c for c in auto_feat.columns if auto_feat[c].dtype!="O"],axis=1)\nauto_featcat_1hot = encoder.fit_transform(auto_featcat)\n```\n\nAt last, we can combine the features again to get one feature matrix.\n\n\n```python\n# recombine features\nauto_feat_prepro = np.concatenate([auto_featnum_scaled, auto_featcat_1hot],axis=1)\n```\n\nFinally done with the preprocessing! 
Or are we? We\'ve done everything manually so far, but actually we can write all that up more efficiently by wrapping our custom transformations in transformer classes and putting them into pipelines together with Scikit-Learn\'s built-in transformers!\n\n\n```python\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.pipeline import FeatureUnion\n\n# a transformer class to select features from a data frame\nclass DataFrameSelector(BaseEstimator, TransformerMixin):\n    def __init__(self, feature_names):\n        self.feature_names = feature_names\n    def fit(self, X, y=None):\n        return self\n    def transform(self, X):\n        return X[self.feature_names].values\n\n# a transformer class to fill missing num_of_doors\nclass NumDoorsFill(BaseEstimator, TransformerMixin):\n    def __init__(self, fill_na=True):\n        self.fill_na = fill_na\n    def fit(self, X, y=None):\n        # remember the majority value seen during fit\n        self.num_d = X.num_of_doors.value_counts().sort_values(ascending=False).index[0]\n        return self\n    def transform(self, X, y=None):\n        if self.fill_na:\n            X.num_of_doors.fillna(self.num_d, inplace=True)\n            return X\n        else:\n            return X.dropna(subset=["num_of_doors"])\n \n# extract numerical and categorical feature names\nnum_feat = [c for c in auto_feat.columns if auto_feat[c].dtype!="O"]\ncat_feat = [c for c in auto_feat.columns if auto_feat[c].dtype=="O"]\n\n# pipeline for the numerical features\nnum_pipeline = Pipeline([\n    (\'selector\', DataFrameSelector(num_feat)),\n    (\'imputer\', SimpleImputer(strategy="median")),\n    (\'std_scaler\', StandardScaler())])\n\n# pipeline for the categorical features\n# (ignore categories that were not seen during fitting)\ncat_pipeline = Pipeline([\n    (\'selector\', DataFrameSelector(cat_feat)),\n    (\'1hot_encoder\', OneHotEncoder(handle_unknown="ignore")),])\n\n# combination of both pipelines\nfull_pipeline = FeatureUnion(transformer_list=[\n    ("num_pipeline", num_pipeline),\n    ("cat_pipeline", cat_pipeline),])\n\nauto_feat_prepro = full_pipeline.fit_transform(auto_feat)\n\n```\n\nNow we\'re really done, let\'s train some models.\n\n#### Selecting and tuning a model\nBefore we get into fine-tuning a model we will first spot-check a couple of commonly used algorithms with default parameters to pick the most promising one for further optimization. We\'ll choose some simple algorithms here (linear regression, K-neighbors regression, and support vector regression). 
If this was a real task we would probably try more algorithms and ensemble methods and short-list a couple of promising candidates.\n\n\n```python\n# import cross validation and some models (linear reg, KNN, SVR)\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.neighbors import KNeighborsRegressor\nfrom sklearn.svm import SVR\n\n# cross-evaluate a linear regression model\nlinreg = LinearRegression()\nlinreg_scores = cross_val_score(linreg, auto_feat_prepro, auto_labels,scoring="r2", cv=10)\n\n# cross-evaluate a K-Neighbors regression model\nknreg = KNeighborsRegressor()\nknreg_scores = cross_val_score(knreg, auto_feat_prepro, auto_labels,scoring="r2", cv=10)\n\n# cross-evaluate a suport vector regression model\nsvreg = SVR(kernel="linear",gamma="scale")\nsvreg_scores = cross_val_score(svreg, auto_feat_prepro, auto_labels,scoring="r2", cv=10)\n\nprint("Cross-evaluation score for {}: {}+/-{}".format("LR",linreg_scores.mean(),linreg_scores.std()))\nprint("Cross-evaluation score for {}: {}+/-{}".format("KNR",knreg_scores.mean(),knreg_scores.std()))\nprint("Cross-evaluation score for {}: {}+/-{}".format("SVR",svreg_scores.mean(),svreg_scores.std()))\n```\n\n Cross-evaluation score for LR: 0.8056458770764865+/-0.1194569502992223\n Cross-evaluation score for KNR: 0.7455563483057739+/-0.16928604290397686\n Cross-evaluation score for SVR: 0.060918559892998804+/-0.15163455291086678\n \n\nLooks like the linear regression looks most promising in this example. We will pick this model for further fine-tuning. The standard linear regression, however, does not have any hyper-parameters to tune, but we can introduce different kinds of regularization approaches (Lasso and Ridge).\n\n\n```python\n# import grid search and regularized linear models\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.linear_model import Lasso\nfrom sklearn.linear_model import Ridge\n\n# values for the regularization parameter alpha\nparam_grid = {"alpha": [0.01,0.1,1,10,100]}\n\n# lasso-regularized linear regression\ngrid_lasso = GridSearchCV(Lasso(max_iter=100000), param_grid=param_grid, cv=10,scoring="r2")\n_ = grid_lasso.fit(auto_feat_prepro, auto_labels)\n\n# ridge-regularized linear regression\ngrid_ridge = GridSearchCV(Ridge(max_iter=100000), param_grid=param_grid, cv=10,scoring="r2")\n_ = grid_ridge.fit(auto_feat_prepro, auto_labels)\n\n# print results\nprint("Best score for lasso regression:\\n {}+/-{} for {}".format(grid_lasso.best_score_,grid_lasso.cv_results_["std_test_score"][grid_lasso.best_index_],grid_lasso.best_params_))\nprint("Best score for ridge regression:\\n {}+/-{} for {}".format(grid_ridge.best_score_,grid_ridge.cv_results_["std_test_score"][grid_ridge.best_index_],grid_ridge.best_params_))\n```\n\n Best score for lasso regression:\n 0.8625877052058882+/-0.07134551312938564 for {\'alpha\': 10}\n Best score for ridge regression:\n 0.866055991114397+/-0.06601838158702997 for {\'alpha\': 1}\n \n\nCool, we managed to improve the *R2* cross-validation score of our model by 6% by using regularization. 
Now we can finally tackle the test set!\n\n\n\n```python\nfrom sklearn.metrics import r2_score\n\n# get the test set\ntest = auto_test_set.copy()\n\n# drop rows with missing price\ntest.dropna(subset=["price"], inplace=True)\n\n# separate labels and features\ntest_labels = test.price.copy()\ntest_feat = test.drop(["price"], axis=1)\n\n# preprocess the test set with the pipeline that was fitted on the training data\ntest_feat_prepro = full_pipeline.transform(test_feat)\n\n# fit the ridge model on the training data\nridge_regression = Ridge(alpha=1).fit(auto_feat_prepro, auto_labels)\n\n# make predictions for the test set\npredictions = ridge_regression.predict(test_feat_prepro)\n\n# evaluate the model\ntest_score = r2_score(test_labels,predictions)\nprint("Test score: "+str(test_score))\n```\n\n Test score: 0.9843494555106602\n \n\nWow, that turned out exceptionally good. Such an improvement from cross-validation to test score is unusual; in fact, the test score is usually lower. We were probably lucky with the few examples that ended up in our test set. The whole data set is fairly small after all. We could check some more models now, test some more feature modifications or add additional features (e.g. different degrees of polynomial features) but I think everything so far serves as a good example already.\n\n### Ensemble methods for Pulsar Neutron Star Classification \nAs a former scientist I love pretty much everything in physics, and astronomy has always interested me due to the extreme conditions being studied. Some of the most fascinating objects found in space are Pulsars, a special kind of neutron star. You can find the Jupyter Notebook and data [here](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository/tree/master/Ensembles-for-neutron-stars). We can read in the description accompanying this data set: \n\n"Pulsars are a rare type of Neutron star that produce radio emission detectable here on Earth. \\[...\\] As pulsars rotate, their emission beam sweeps across the sky, and when this crosses our line of sight, produces a detectable pattern of broadband radio emission. As pulsars rotate rapidly, this pattern repeats periodically. Thus pulsar search involves looking for periodic radio signals with large radio telescopes." \n\nThis data set contains events of radio signal detection representing Pulsar candidates, which were recorded during the [High Time Resolution Universe Survey 1](https://academic.oup.com/mnras/article/409/2/619/1037409). It was contributed to the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/HTRU2#) by Robert Lyon from the University of Manchester\'s School of Physics and Astronomy.\n\n#### Frame the Problem\n\nTo detect the Pulsars, the signal is recorded over many rotations of the pulsar and the integrated pulse profile is used along with the DM-SNR curve (dispersion measure - signal to noise ratio curve, see [here](http://www.scienceguyrob.com/wp-content/uploads/2016/12/WhyArePulsarsHardToFind_Lyon_2016.pdf)).\nHowever, Pulsars are very rare and many spurious events caused by radio frequency interference (RFI) get detected, which makes it hard to find legitimate signals. It is very time-intensive and costly to have every record checked by human experts; therefore, machine learning could be of great aid to Pulsar detection.\n\nWe have a two-class classification problem at hand (Pulsar or not Pulsar) and we are most likely dealing with a high class imbalance because Pulsars are very rare. 
Because of this imbalance, we should not choose accuracy as a metric to evaluate the performance of our model. It is probably not a catastrophe if human experts still have to filter out some false positives manually, but we\'d be really sad if we missed a rare Pulsar (a false negative). Hence, we should optimize for recall, the fraction of actual Pulsars that we correctly detect, to minimize the number of false negatives.\n\n\n```python\n# import libraries \nimport numpy as np # numerical computation\nimport pandas as pd # data handling\nimport warnings\nwarnings.filterwarnings(\'ignore\')\n# visualization\nimport matplotlib.pyplot as plt \nimport seaborn as sns \nsns.set_style("darkgrid")\n%matplotlib notebook\n```\n\n#### Data preparation\nLet\'s import the data into a data frame and have a look.\n\n\n```python\n# load data and show first 5 rows\n# the columns are:\n# 1. Mean of the integrated profile.\n# 2. Standard deviation of the integrated profile.\n# 3. Excess kurtosis of the integrated profile.\n# 4. Skewness of the integrated profile.\n# 5. Mean of the DM-SNR curve.\n# 6. Standard deviation of the DM-SNR curve.\n# 7. Excess kurtosis of the DM-SNR curve.\n# 8. Skewness of the DM-SNR curve.\n# 9. Class (1 = Pulsar, 0 = no Pulsar)\n\npulsar = pd.read_csv("HTRU_2.csv",header=None,names=["ip_mean","ip_std","ip_kurt","ip_skew","dmsnr_mean","dmsnr_std","dmsnr_kurt","dmsnr_skew","label"])\npulsar.head()\n```\n\n
| | ip_mean | ip_std | ip_kurt | ip_skew | dmsnr_mean | dmsnr_std | dmsnr_kurt | dmsnr_skew | label |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 140.562500 | 55.683782 | -0.234571 | -0.699648 | 3.199833 | 19.110426 | 7.975532 | 74.242225 | 0 |
| 1 | 102.507812 | 58.882430 | 0.465318 | -0.515088 | 1.677258 | 14.860146 | 10.576487 | 127.393580 | 0 |
| 2 | 103.015625 | 39.341649 | 0.323328 | 1.051164 | 3.121237 | 21.744669 | 7.735822 | 63.171909 | 0 |
| 3 | 136.750000 | 57.178449 | -0.068415 | -0.636238 | 3.642977 | 20.959280 | 6.896499 | 53.593661 | 0 |
| 4 | 88.726562 | 40.672225 | 0.600866 | 1.123492 | 1.178930 | 11.468720 | 14.269573 | 252.567306 | 0 |
\n\n\n\n\n```python\n# print info\npulsar.info()\n```\n\n \n RangeIndex: 17898 entries, 0 to 17897\n Data columns (total 9 columns):\n ip_mean 17898 non-null float64\n ip_std 17898 non-null float64\n ip_kurt 17898 non-null float64\n ip_skew 17898 non-null float64\n dmsnr_mean 17898 non-null float64\n dmsnr_std 17898 non-null float64\n dmsnr_kurt 17898 non-null float64\n dmsnr_skew 17898 non-null float64\n label 17898 non-null int64\n dtypes: float64(8), int64(1)\n memory usage: 1.2 MB\n \n\n\n```python\n# print describtion \npulsar.describe()\n```\n\n\n\n\n
| | ip_mean | ip_std | ip_kurt | ip_skew | dmsnr_mean | dmsnr_std | dmsnr_kurt | dmsnr_skew | label |
|---|---|---|---|---|---|---|---|---|---|
| count | 17898.000000 | 17898.000000 | 17898.000000 | 17898.000000 | 17898.000000 | 17898.000000 | 17898.000000 | 17898.000000 | 17898.000000 |
| mean | 111.079968 | 46.549532 | 0.477857 | 1.770279 | 12.614400 | 26.326515 | 8.303556 | 104.857709 | 0.091574 |
| std | 25.652935 | 6.843189 | 1.064040 | 6.167913 | 29.472897 | 19.470572 | 4.506092 | 106.514540 | 0.288432 |
| min | 5.812500 | 24.772042 | -1.876011 | -1.791886 | 0.213211 | 7.370432 | -3.139270 | -1.976976 | 0.000000 |
| 25% | 100.929688 | 42.376018 | 0.027098 | -0.188572 | 1.923077 | 14.437332 | 5.781506 | 34.960504 | 0.000000 |
| 50% | 115.078125 | 46.947479 | 0.223240 | 0.198710 | 2.801839 | 18.461316 | 8.433515 | 83.064556 | 0.000000 |
| 75% | 127.085938 | 51.023202 | 0.473325 | 0.927783 | 5.464256 | 28.428104 | 10.702959 | 139.309331 | 0.000000 |
| max | 192.617188 | 98.778911 | 8.069522 | 68.101622 | 223.392140 | 110.642211 | 34.539844 | 1191.000837 | 1.000000 |
\n\n\n\n\n```python\n# look for missing values\npulsar.isna().sum()\n```\n\n\n\n\n ip_mean 0\n ip_std 0\n ip_kurt 0\n ip_skew 0\n dmsnr_mean 0\n dmsnr_std 0\n dmsnr_kurt 0\n dmsnr_skew 0\n label 0\n dtype: int64\n\n\n\n\n```python\n# look at class distribution\npulsar.label.value_counts()\n```\n\n\n\n\n 0 16259\n 1 1639\n Name: label, dtype: int64\n\n\n\nWe have 17898 instances of which only 9.2 % correspond to the positive class. The 8 numerical features correspond to simple statistics (mean, standard deviation, excess kurtosis, and skewness) of the radio signals\' integrated profile and DM-SNR curve.\n\nWe can have a look on how the features are distributed by plotting them in histograms:\n\n\n```python\n# plot histograms\npulsar.hist()\nplt.tight_layout()\n```\n\n\n\n\n\nWe can see a couple of things in these histograms: \n- It does not look like any of the features were clipped of at any maximum or minimum value (which is great so we won\'t have to fix that).\n- The features are on very different scales, so we will have to rescale them later on.\n- Some of the features are very skewed or have outliers, which can be a problem for some algorithms and scaling transformations. We could try to transform them into a more Gaussian shape.\n\nWe can now put aside a hold-out test set. Or actually, since I want to work with ensemble methods in this example, I will also split the train set into two subsets. One will be used to train and tune different classifiers and the second one will be used to train another classifier in a second layer which will blend the outputs of the first layer of classifiers. If we are lucky, our blended ensemble will work better than the individual classifiers.\n\n\n```python\nfrom sklearn.model_selection import StratifiedShuffleSplit\n# put 20% of the data aside as a test set\nsplit1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1234)\nfor train_index, test_index in split1.split(pulsar, pulsar["label"]):\n pulsar_train = pulsar.loc[train_index]\n pulsar_test = pulsar.loc[test_index]\n X_test = pulsar_test.loc[:,"ip_mean":"dmsnr_skew"]\n y_test = pulsar_test.loc[:,"label"]\n \n \n# put 25% of the training data aside for the blender layer 2\npulsar_train = pulsar_train.reset_index(drop=True)\nsplit2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=1234)\nfor train_index, test_index in split2.split(pulsar_train, pulsar_train["label"]):\n pulsar_layer1 = pulsar_train.reindex().loc[train_index]\n X_layer1 = pulsar_layer1.loc[:,"ip_mean":"dmsnr_skew"]\n y_layer1 = pulsar_layer1.loc[:,"label"]\n \n pulsar_layer2 = pulsar_train.reindex().loc[test_index]\n X_layer2 = pulsar_layer2.loc[:,"ip_mean":"dmsnr_skew"]\n y_layer2 = pulsar_layer2.loc[:,"label"]\n\n```\n\nLet\'s see if we can bring the data in a more Gaussian shape with Scikit-Learn\'s `PowerTransformer`. If there will still be strong outliers which would cause problems we could also try a `QuantileTransformer` which would force all values in a similar range and still produce a kind of Gaussian shaped distribution. At this point I have no idea which may work better so we\'ll just treat it as a hyper-parameter and try both. 
Let\'s write a transformer class for it.\n\n\n```python\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.preprocessing import PowerTransformer\nfrom sklearn.preprocessing import QuantileTransformer\n\n# a transformer class to select either PowerTranformer or gaussian QuantileTransformer\nclass PQTransformer(BaseEstimator, TransformerMixin):\n def __init__(self, trans="power"):\n self.trans = trans\n self.pt = PowerTransformer()\n self.qt = QuantileTransformer(output_distribution="normal")\n \n def fit(self, X, y=None):\n self.pt.fit(X)\n self.qt.fit(X)\n return self\n \n def transform(self, X):\n if self.trans == "power":\n return self.pt.transform(X)\n elif self.trans == "quantile":\n return self.qt.transform(X)\n else:\n return None\n\n```\n\nLet\'s have a look at the feature distribution after both transform options:\n\n\n```python\n# use a PowerTransformer\ntransformer = PQTransformer(trans="power")\nX_layer1_trans = pd.DataFrame(transformer.fit_transform(X_layer1),\n columns=X_layer1.columns)\nX_layer1_trans.plot(kind="box")\nplt.tight_layout()\n```\n\n\n\n\n\n\n```python\n# use a Gaussian QuantileTransformer\ntransformer = PQTransformer(trans="quantile")\nX_layer1_trans = pd.DataFrame(transformer.fit_transform(X_layer1),\n columns=X_layer1.columns)\nX_layer1_trans.plot(kind="box")\nplt.tight_layout()\n```\n\n\n\n\n\nThe transformed data looks as expected; after the `PowerTransformer`, the data is still on quite different scales due to the outlying values. The `QuantileTransformer` brings everything on the same range but introduced [saturation artifacts](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py) for extreme values. We\'ll see what works better.\n\nSince all the 8 features come from only two types of measurements, they are most likely correlated in some way. Let\'s have a look at the correlation matrix.\n\n\n```python\n# plot correlation matrix\nsns.heatmap(X_layer1_trans.corr(),annot=True, cmap=plt.cm.seismic)\n```\n\n\n\n\n \n\n\n\n\n\n\n\nWe can try to incorporate feature interaction by adding polynomial features, treating the degree as a hyper-parameter. However we should actually create these features before we do our power/quantile transform so we\'ll restructure the work flow a bit.\n\n\n```python\n# put polynomial features and Transformer into a pipeline\nfrom sklearn.preprocessing import PolynomialFeatures\nfrom sklearn.pipeline import Pipeline\npreprocess = Pipeline([\n (\'poly\', PolynomialFeatures()),\n (\'trans\', PQTransformer())])\n```\n\n#### Selecting and tuning models\nTime to train some classifiers on the data. 
We\'ll start with a simple logistic regression to get somewhat of a benchmark.\n\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import GridSearchCV\n\n# set up a pipeline with the preprocessing and a classifier \nlr_clf = Pipeline([\n (\'prep\', preprocess),\n (\'lr\', LogisticRegression(class_weight="balanced",\n solver="lbfgs",\n random_state=1234))]) # acount for class imbalance\n\n# perform a grid search cross evaluation of the paramters below\nparam_grid = {\n \'prep__poly__degree\': [1,2,3],\n \'prep__trans__trans\': ["power","quantile"],\n \'lr__C\': [0.01,0.1,1,10],\n}\n\nlr_grid = GridSearchCV(lr_clf, param_grid, scoring="recall",iid=False, cv=5,n_jobs=-1)\nlr_grid.fit(X_layer1, y_layer1)\nprint("Best parameter (CV recall score=%0.3f):" % lr_grid.best_score_)\nprint(lr_grid.best_params_)\n\n```\n\n Best parameter (CV recall score=0.927):\n {\'lr__C\': 1, \'prep__poly__degree\': 1, \'prep__trans__trans\': \'quantile\'}\n \n\n\n```python\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import precision_score\nfrom sklearn.metrics import recall_score\nfrom sklearn.metrics import accuracy_score\n# let\'s do a prediction for the layer 2 test set\ny_pred = lr_grid.predict(X_layer2)\nprint("Logistic regression on layer 2 test set.\\nConfusion matrix:")\nprint(confusion_matrix(y_layer2, y_pred))\nprint("Recall: {:0.3f}".format(recall_score(y_layer2, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_layer2, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_layer2, y_pred)))\n```\n\n Logistic regression on layer 2 test set.\n Confusion matrix:\n [[3076 176]\n [ 27 301]]\n Recall: 0.918\n Precision: 0.631\n Accuracy: 0.943\n \n\nThat already looks pretty okay. Since we were optimizing for recall the precision is lousy but thats probably okay - rather have humans dig through some spurious signals than missing a real one. But unfortunately we\'re still missing a few Pulsars (false negatives) here, so let\'s see if we can do better with applying ensemble methods. One of the most popular algorithms in that field is random forest. It basically is an ensemble of decision trees of which each just deals with a subset of features and a bootstrapped subset of training instances. The prediction of all these simple trees is then combined to come up with a final prediction of the whole forest. The forest can be regularized by regularizing its trees (e.g. 
limiting number of leave nodes or depth).\n\n\n```python\nfrom sklearn.ensemble import RandomForestClassifier\n\n# set up a pipeline with the preprocessing and a classifier \nrf_clf = Pipeline([\n (\'prep\', preprocess),\n (\'rf\', RandomForestClassifier(class_weight="balanced_subsample",\n random_state=1234,\n n_jobs=-1))]) # acount for class imbalance\n\n# perform a grid search cross evaluation of the paramters below\nparam_grid = {\n \'prep__poly__degree\': [1,2,3],\n \'prep__trans__trans\': ["power","quantile"],\n \'rf__n_estimators\': [100,300,500],\n \'rf__max_leaf_nodes\': [5,10,20],\n}\n\nrf_grid = GridSearchCV(rf_clf, param_grid, scoring="recall",iid=False, cv=5,n_jobs=-1)\nrf_grid.fit(X_layer1, y_layer1)\nprint("Best parameter (CV recall score=%0.3f):" % rf_grid.best_score_)\nprint(rf_grid.best_params_)\n```\n\n Best parameter (CV recall score=0.910):\n {\'prep__poly__degree\': 3, \'prep__trans__trans\': \'power\', \'rf__max_leaf_nodes\': 10, \'rf__n_estimators\': 100}\n \n\n\n```python\n# we reached the border of our n_estimator parameter, so lets do another grid search around that parameter\nrf2_clf = Pipeline([\n (\'prep\', preprocess),\n (\'rf\', RandomForestClassifier(class_weight="balanced_subsample",\n random_state=1234,\n n_jobs=-1))]) # acount for class imbalance\nparam_grid = {\n \'prep__poly__degree\': [3],\n \'prep__trans__trans\': ["power"],\n \'rf__n_estimators\': [50,100],\n \'rf__max_leaf_nodes\': [4,6,10],\n}\n\nrf2_grid = GridSearchCV(rf2_clf, param_grid, scoring="recall",iid=False, cv=5,n_jobs=-1)\nrf2_grid.fit(X_layer1, y_layer1)\nprint("Best parameter (CV recall score=%0.3f):" % rf2_grid.best_score_)\nprint(rf2_grid.best_params_)\n```\n\n Best parameter (CV recall score=0.910):\n {\'prep__poly__degree\': 3, \'prep__trans__trans\': \'power\', \'rf__max_leaf_nodes\': 6, \'rf__n_estimators\': 100}\n \n\n\n```python\n# let\'s do a prediction for the layer 2 test set\ny_pred = rf2_grid.predict(X_layer2)\nprint("Random forrest on layer 2 test set.\\nConfusion matrix:")\nprint(confusion_matrix(y_layer2, y_pred))\nprint("Recall: {:0.3f}".format(recall_score(y_layer2, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_layer2, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_layer2, y_pred)))\n```\n\n Random forrest on layer 2 test set.\n Confusion matrix:\n [[3181 71]\n [ 33 295]]\n Recall: 0.899\n Precision: 0.806\n Accuracy: 0.971\n \n\nLooks like the forest actually performed (a little bit) worse than just the logistic regression. Maybe we can try another kind of ensemble, e.g. AdaBoost. In this ensemble, we subsequently train weak learners and each learner will try to correct the errors that its predecessor makes. We could use pretty much any base estimator, but let\'s stick to trees. Or rather stumps because we won\'t allow them to perform more than one split per estimator. 
\n\n\n```python\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.tree import DecisionTreeClassifier\n\n# set up a pipeline with the preprocessing and a classifier \nab_clf = Pipeline([\n (\'prep\', preprocess),\n (\'ab\', AdaBoostClassifier(\n base_estimator=DecisionTreeClassifier(max_depth=1,\n class_weight="balanced"),\n algorithm="SAMME.R",\n random_state=1234))]) # account for class imbalance\n\n# perform a grid search cross evaluation of the parameters below\nparam_grid = {\n \'prep__poly__degree\': [1,2,3],\n \'prep__trans__trans\': ["power","quantile"],\n \'ab__n_estimators\': [10,50,100],\n \'ab__learning_rate\': [0.5,1.0,1.5],\n}\n\nab_grid = GridSearchCV(ab_clf, param_grid, scoring="recall",iid=False, cv=5,n_jobs=-1)\nab_grid.fit(X_layer1, y_layer1)\nprint("Best parameter (CV recall score=%0.3f):" % ab_grid.best_score_)\nprint(ab_grid.best_params_)\n```\n\n Best parameter (CV recall score=0.919):\n {\'ab__learning_rate\': 0.5, \'ab__n_estimators\': 10, \'prep__poly__degree\': 3, \'prep__trans__trans\': \'power\'}\n \n\n\n```python\n# we reached the border of our n_estimators parameter, so let\'s do another grid search around that parameter\nab2_clf = Pipeline([\n (\'prep\', preprocess),\n (\'ab\', AdaBoostClassifier(\n base_estimator=DecisionTreeClassifier(max_depth=1,\n class_weight="balanced"),\n algorithm="SAMME.R",\n random_state=1234))]) # account for class imbalance\n\nparam_grid = {\n \'prep__poly__degree\': [3],\n \'prep__trans__trans\': ["power"],\n \'ab__n_estimators\': [1,2,3,10],\n \'ab__learning_rate\': [0.1,0.3,0.5],\n}\n\nab2_grid = GridSearchCV(ab2_clf, param_grid, scoring="recall",iid=False, cv=5,n_jobs=-1)\nab2_grid.fit(X_layer1, y_layer1)\nprint("Best parameter (CV recall score=%0.3f):" % ab2_grid.best_score_)\nprint(ab2_grid.best_params_)\n```\n\n Best parameter (CV recall score=0.924):\n {\'ab__learning_rate\': 0.3, \'ab__n_estimators\': 2, \'prep__poly__degree\': 3, \'prep__trans__trans\': \'power\'}\n \n\n\n```python\n# let\'s do a prediction for the layer 2 test set\ny_pred = ab2_grid.predict(X_layer2)\nprint("AdaBoost on layer 2 test set.\\nConfusion matrix:")\nprint(confusion_matrix(y_layer2, y_pred))\nprint("Recall: {:0.3f}".format(recall_score(y_layer2, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_layer2, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_layer2, y_pred)))\n```\n\n AdaBoost on layer 2 test set.\n Confusion matrix:\n [[3095 157]\n [ 24 304]]\n Recall: 0.927\n Precision: 0.659\n Accuracy: 0.949\n \n\nNice, we minimally improved the performance compared to the logistic regression. Now what if we stuffed our ensembles into an even bigger ensemble? Each of the classifiers we have trained and tuned so far can predict a probability for a class. So we could take into account all these probabilities to find out which class is considered most likely by all the classifiers combined. Scikit-Learn offers a `VotingClassifier` for this purpose. 
\n\n\n```python\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import precision_score\nfrom sklearn.metrics import recall_score\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom sklearn.tree import DecisionTreeClassifier\n\n\nfrom sklearn.ensemble import VotingClassifier\n# set up the preprocessing pipelines with the classifiers from earlier, using the optimized parameters\n\n# preprocessing + logistic regression\nlr_pipe = Pipeline([(\'poly\',PolynomialFeatures(degree=3)),\n (\'trans\',PQTransformer(trans="quantile")),\n (\'lr\', LogisticRegression(C=1,\n class_weight="balanced",\n solver="lbfgs",\n max_iter=100000,\n random_state=1234))])\n# preprocessing + random forest\nrf_pipe = Pipeline([(\'poly\',PolynomialFeatures(degree=3)),\n (\'power\',PQTransformer(trans="power")),\n (\'rf\', RandomForestClassifier(n_estimators=100,\n max_leaf_nodes=6,\n class_weight="balanced_subsample",\n random_state=1234,\n n_jobs=-1))])\n# preprocessing + AdaBoost\nab_pipe = Pipeline([(\'poly\',PolynomialFeatures(degree=3)),\n (\'power\',PQTransformer(trans="power")),\n (\'ab\', AdaBoostClassifier(\n base_estimator=DecisionTreeClassifier(max_depth=1,\n class_weight="balanced"),\n learning_rate=0.3,\n n_estimators=2,\n algorithm="SAMME.R",\n random_state=1234))])\n\n\n# combine everything in one voting classifier\nvote_clf = VotingClassifier(estimators=[(\'lr\', lr_pipe), (\'rf\', rf_pipe), (\'ab\', ab_pipe)],\n voting="soft", n_jobs=-1)\n_ = vote_clf.fit(X=X_layer1,y=y_layer1)\n```\n\n\n```python\n# let\'s do a prediction for the layer 2 test set\ny_pred = vote_clf.predict(X_layer2)\nprint("Voting classifier on layer 2 test set.\\nConfusion matrix:")\nprint(confusion_matrix(y_layer2, y_pred))\nprint("Recall: {:0.3f}".format(recall_score(y_layer2, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_layer2, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_layer2, y_pred)))\n```\n\n Voting classifier on layer 2 test set.\n Confusion matrix:\n [[3166 86]\n [ 28 300]]\n Recall: 0.915\n Precision: 0.777\n Accuracy: 0.968\n \n\nAs we can see, combining the classifiers into a voting ensemble did not improve the performance. 
Let\'s have a look at which examples are being misclassified.\n\n\n```python\n# train all the classifiers and get the false negatives from their predictions\nlr_pipe.fit(X_layer1,y_layer1)\nlr_y_pred = lr_pipe.predict(X_layer2)\nlr_fn = [ i for i,x in enumerate(zip(y_layer2,lr_y_pred)) if x[0]!=x[1] and x[0]==1]\n\nrf_pipe.fit(X_layer1,y_layer1)\nrf_y_pred = rf_pipe.predict(X_layer2)\nrf_fn = [ i for i,x in enumerate(zip(y_layer2,rf_y_pred)) if x[0]!=x[1] and x[0]==1]\n\nab_pipe.fit(X_layer1,y_layer1)\nab_y_pred = ab_pipe.predict(X_layer2)\n# use the AdaBoost predictions here (not the random forest ones)\nab_fn = [ i for i,x in enumerate(zip(y_layer2,ab_y_pred)) if x[0]!=x[1] and x[0]==1]\n\nvote_clf.fit(X_layer1,y_layer1)\nvote_y_pred = vote_clf.predict(X_layer2)\n# use the voting classifier\'s predictions here (not the random forest ones)\nvote_fn = [ i for i,x in enumerate(zip(y_layer2,vote_y_pred)) if x[0]!=x[1] and x[0]==1]\n\n# count how often each false negative got misclassified\nprint(pd.Series(lr_fn+ab_fn+rf_fn+vote_fn).value_counts())\n```\n\n 2805 4\n 2960 4\n 2062 4\n 2333 4\n 554 4\n 1581 4\n 1590 4\n 2879 4\n 1603 4\n 326 4\n 3416 4\n 122 4\n 2234 4\n 134 4\n 244 4\n 140 4\n 2548 4\n 1222 4\n 236 4\n 230 4\n 3342 4\n 3061 4\n 3019 4\n 3530 4\n 1793 4\n 186 4\n 432 4\n 686 4\n 2141 3\n 3291 3\n 2225 3\n 2784 3\n 795 3\n 1846 1\n 2673 1\n 383 1\n 596 1\n dtype: int64\n \n\nLooks like almost all false negatives got misclassified by all classifiers.\n\nWe will now try to blend the output of all these classifiers into another classifier. Maybe there\'s a pattern in why all these false negatives get misclassified, and maybe a second-layer classifier will be able to pick up this pattern and correct it.\n\nWe will use our layer 2 set to train the blender layer, but first, we\'ll have to send it through layer 1 (which was trained on the layer 1 set) and predict each instance\'s class probability, which will be used as the features for layer 2. Note that when we are just dealing with the probability for a class, we cannot optimize for recall, because one could simply categorize every instance as a Pulsar: then there would be no false negatives, but a lot of false positives, and the whole task wouldn\'t make any sense. Therefore, we will use the F1 score, which is a mix of recall and precision. 
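\nAs a quick illustration of what that mix is, the F1 score is the harmonic mean of precision and recall; plugging in the (rounded, approximate) precision and recall of the AdaBoost model above:\n\n\n```python\n# F1 is the harmonic mean of precision and recall,\n# e.g. for the AdaBoost results above (precision ~0.659, recall ~0.927)\nprecision, recall = 0.659, 0.927\nf1 = 2 * precision * recall / (precision + recall)\nprint(round(f1, 3))  # roughly 0.77\n```\n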
The F1 score is better suited than accuracy for strong class imbalances like the one we have in this case.\n\n\n```python\n# we\'ll predict the probability for the positive class using all classifiers\ndef blender_features(X_layer2):\n lr_y_prob = lr_pipe.predict_proba(X_layer2)[:,1]\n rf_y_prob = rf_pipe.predict_proba(X_layer2)[:,1]\n ab_y_prob = ab_pipe.predict_proba(X_layer2)[:,1]\n vote_y_prob = vote_clf.predict_proba(X_layer2)[:,1]\n# and these probabilities will be the new features for our second blender layer\n return np.c_[lr_y_prob,rf_y_prob,ab_y_prob,vote_y_prob]\n```\n\n\n```python\n# do a grid search on a logistic regression classifier which serves as the blender\nX_blend = blender_features(X_layer2)\n\nparam_grid = {"C":[1e-10,1e-5,1e-5,1e-2,1]}\nblender_grid = GridSearchCV(LogisticRegression(class_weight="balanced",\n solver="lbfgs",\n random_state=1234),\n param_grid=param_grid, scoring="f1",\n iid=False, cv=5,n_jobs=-1)\n\nblender_grid.fit(X_blend,y_layer2)\nprint("Best parameter (CV F1 score=%0.3f):" % blender_grid.best_score_)\nprint(blender_grid.best_params_)\n```\n\n Best parameter (CV F1 score=0.839):\n {\'C\': 1e-05}\n \n\n\n```python\n# do the final test prediction for the actual test set\nX_blend_test = blender_features(X_test)\ny_pred = blender_grid.predict(X_blend_test)\nprint("Blender classifier on final test set.\\nConfusion matrix:")\nprint(confusion_matrix(y_test, y_pred))\nprint("Recall: {:0.3f}".format(recall_score(y_test, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_test, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_test, y_pred)))\n```\n\n Blender classifier on final test set.\n Confusion matrix:\n [[3167 85]\n [ 30 298]]\n Recall: 0.909\n Precision: 0.778\n Accuracy: 0.968\n \n\nOkay, that\'s a little disappointing. Not much of an improvement here. Looks like all the fancy ensembles we tried are not performing much better than the first simple logistic regression we used as a benchmark. \n\nThat probably means that we are not just dealing with random noise that troubles some types of classifiers and that could be cancelled out by using ensembles. All classifiers seemed to misclassify the same instances. Probably there is something systematically different about the misclassified false negative Pulsars, something that makes them look more like the spurious signals in the data set. \n\nIf we really wanted to solve this problem we should investigate these examples in detail now, find out what is special about them, and engineer this knowledge into features which a classification algorithm can work with more effectively. But hey, this was just supposed to be an example for demonstrating ensemble methods; hence, let\'s call it a day and end here.\n\n### Dimensionality reduction on colonoscopy video data \n\nMachine learning is becoming more successful in many medical applications, especially in analyzing image or video data. The field of proctology is no exception. I find this example particularly interesting as one of my best friends has to get regular colonoscopies as a measure of colon cancer prophylaxis and I\'d like to tell her if machine learning can assist the doctors in classifying whatever they find in her behind as benign or malignant. 
You can find the Jupyter Notebook and data [here](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository/tree/master/Dimensionality-reduction-for-colonoscopy-data).\n\nThis data set contains 76 instances of gastrointestinal lesions, of two malignant types and one benign type, from regular colonoscopy data. It was contributed to the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Gastrointestinal+Lesions+in+Regular+Colonoscopy) by Pablo Mesejo and Daniel Pizarro. You can find a detailed description on the website. They also provided a summary of how human experts performed on classifying the examples.\n\n#### Frame the Problem\nIn this example, we want to classify each lesion as either a serrated adenoma, an adenoma, or a hyperplastic lesion. The first two are considered malignant, the latter benign. Hence, we can also treat it as a binary classification problem (the authors of the data set labeled this binary case as *resection* vs. *no-resection*). \n\nWhat is special about this data set is that we are dealing with a lot of features but only a few instances. From each video, there are 422 2D textural features, 76 2D color features, and 200 3D shape features, each of those recorded under two different light conditions. That is a total of 1396 features and only 76 instances. Most classification algorithms will have problems with this ratio. Support vector machines are known for being able to deal with many features given few instances, but this ratio might be too extreme even for them. So what can we do about it? \n\nIn this example we will try to apply principal component analysis (PCA) to reduce the dimensionality of the feature space while still preserving much of the original data\'s information. PCA projects the original high-dimensional data onto a lower-dimensional hyperplane (which is spanned by the principal components). We can either specify a number of dimensions and pick the PCs that preserve the most variance, or specify how much variance should be preserved and pick the number of PCs accordingly. \n\nI would usually choose recall as the optimization metric because I think it is important not to miss potential cancer; but since we want to compare with the human experts (who presumably want to get all classifications right) and we have a class imbalance, I\'ll choose the F1 score.\n\n\n```python\n# import libraries \nimport numpy as np # numerical computation\nimport pandas as pd # data handling\nimport warnings\nwarnings.filterwarnings(\'ignore\')\n# visualization\nimport matplotlib.pyplot as plt \nimport seaborn as sns \nsns.set_style("darkgrid")\n%matplotlib notebook\n```\n\n#### Data preparation\nLet\'s import the data into a data frame and have a look.\n\n\n```python\ncolon = pd.read_csv("data.txt")\ncolon.head()\n```\n\n\n\n\n
| | adenoma_1 | adenoma_1.1 | adenoma_8 | adenoma_8.1 | adenoma_9 | adenoma_9.1 | adenoma_10 | adenoma_10.1 | adenoma_11 | adenoma_11.1 | ... | serrated_5 | serrated_5.1 | serrated_6 | serrated_6.1 | serrated_7 | serrated_7.1 | serrated_8 | serrated_8.1 | serrated_9 | serrated_9.1 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 | ... | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 |
| 1 | 1.000000 | 2.000000 | 2.000000 | 1.000000 | 2.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 1.000000 | ... | 1.000000 | 2.000000 | 1.000000 | 2.000000 | 1.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 2.000000 |
| 2 | 138.120000 | 127.990000 | 80.415000 | 90.896000 | 106.160000 | 147.090000 | 148.730000 | 126.050000 | 109.130000 | 129.700000 | ... | 114.590000 | 86.424000 | 163.680000 | 71.638000 | 180.110000 | 136.550000 | 96.852000 | 157.810000 | 93.569000 | 95.543000 |
| 3 | 1606.800000 | 3377.900000 | 1852.100000 | 1904.300000 | 1184.400000 | 822.320000 | 2412.500000 | 4752.200000 | 999.390000 | 599.950000 | ... | 3014.500000 | 3500.900000 | 3253.100000 | 1822.200000 | 1198.500000 | 1316.300000 | 2071.300000 | 2732.300000 | 1163.600000 | 2240.500000 |
| 4 | 0.003875 | 0.003564 | 0.004761 | 0.004147 | 0.005518 | 0.003871 | 0.003336 | 0.004188 | 0.005541 | 0.005917 | ... | 0.004444 | 0.003409 | 0.004869 | 0.004148 | 0.003273 | 0.002442 | 0.004379 | 0.004015 | 0.002199 | 0.004803 |
\n
5 rows \xd7 152 columns
\n
\n\n\n\nIn this raw format, each column represents an instance measured at one of the two different light conditions, and each row represents a feature. The first row is the class label and the second row the light condition (1 or 2). We\'ll first have to merge the two light conditions per instance and then transpose the data frame so that we have the data in a [tidy format](https://vita.had.co.nz/papers/tidy-data.pdf#targetText=This%20paper%20tackles%20a%20small,observational%20unit%20is%20a%20table.).\n\n\n```python\n# get the column names for light conditions 1 and 2\nlight1cols = colon.loc[1,colon.iloc[1]==1].index\nlight2cols = colon.loc[1,colon.iloc[1]==2].index\n# create separate data frames for the two conditions\nlight1 = colon[light1cols]\nlight2 = colon[light2cols]\n# give them the same column names so that they can be appended \nlight2.columns = light1.columns\n# append the data frames while dropping the light condition and class label from one of them\ncolon = light1.append(light2.iloc[2:])\n# drop the light condition from the other one\ncolon = colon.drop(1,axis=0).reset_index(drop=True)\n# transpose the data frame so that instances are rows and features are columns\ncolon = colon.T.reset_index(drop=True).rename(columns={0:"label"})\n```\n\n\n```python\ncolon.head()\n```\n\n\n\n\n
| | label | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1387 | 1388 | 1389 | 1390 | 1391 | 1392 | 1393 | 1394 | 1395 | 1396 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.0 | 138.120 | 1606.80 | 0.003875 | 0.005880 | 0.005213 | 0.006935 | 0.007333 | 0.009580 | 0.007380 | ... | 0.013994 | 0.013532 | 0.013157 | 0.012743 | 0.012613 | 0.012422 | 0.012252 | 0.011377 | 0.011198 | 0.011131 |
| 1 | 3.0 | 90.896 | 1904.30 | 0.004147 | 0.006728 | 0.005061 | 0.006879 | 0.007948 | 0.009525 | 0.010492 | ... | 0.003564 | 0.003380 | 0.003232 | 0.003200 | 0.003006 | 0.002985 | 0.002922 | 0.002631 | 0.002610 | 0.002531 |
| 2 | 3.0 | 147.090 | 822.32 | 0.003871 | 0.005211 | 0.005834 | 0.006971 | 0.011036 | 0.012802 | 0.011083 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 3.0 | 148.730 | 2412.50 | 0.003336 | 0.007695 | 0.004139 | 0.005736 | 0.005794 | 0.006697 | 0.007721 | ... | 0.020822 | 0.020115 | 0.019595 | 0.019252 | 0.018897 | 0.018177 | 0.018158 | 0.017587 | 0.017109 | 0.016648 |
| 4 | 3.0 | 129.700 | 599.95 | 0.005917 | 0.007934 | 0.006976 | 0.007695 | 0.008404 | 0.008825 | 0.010306 | ... | 0.000140 | 0.000129 | 0.000117 | 0.000111 | 0.000106 | 0.000098 | 0.000093 | 0.000082 | 0.000079 | 0.000076 |
\n
5 rows \xd7 1397 columns
\n
\n\n\n\nNow that looks a lot tidier. Let\'s also change the class labels: 1 becomes 0 for benign; 2 and 3 become 1 for malignant. \n\n\n```python\ncolon.label[colon.label==1] = 0\ncolon.label[(colon.label==2) | (colon.label==3)] = 1\ncolon.sample(5)\n```\n\n\n\n\n
\n(Output: a random sample of 5 of the 76 instances, 5 rows \xd7 1397 columns; the label column now only contains the new binary values 0.0 and 1.0.)\n
\n\n\n\n\n```python\n# look for missing values\ncolon.isna().sum()[colon.isna().sum()>0]\n```\n\n\n\n\n Series([], dtype: int64)\n\n\n\n\n```python\n# look at class distribution\ncolon.label.value_counts()\n```\n\n\n\n\n 1.0 55\n 0.0 21\n Name: label, dtype: int64\n\n\n\nLooks like we don\'t have to do any more data cleaning. We should now split off our test set.\n\n\n```python\nfrom sklearn.model_selection import StratifiedShuffleSplit\n# put 20% of the data aside as a test set\nsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1234)\nfor train_index, test_index in split.split(colon, colon["label"]):\n colon_train = colon.loc[train_index]\n colon_test = colon.loc[test_index]\n X_train = colon_train.iloc[:,1:]\n y_train = colon_train.loc[:,"label"]\n X_test = colon_test.iloc[:,1:]\n y_test = colon_test.loc[:,"label"]\n\n```\n\n#### Selecting and tuning models\nLet\'s try to get a baseline first by training an SVC model with all the features and no preprocessing besides standard scaling. We can then compare the performance of the lower-dimensionality models with this benchmark.\n\n\n```python\n# import all sklearn functions needed\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.svm import SVC\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import confusion_matrix\nfrom sklearn.metrics import precision_score\nfrom sklearn.metrics import recall_score\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import f1_score\n\n# put a standard scaler and a support vector classifier in a pipeline\npipe = Pipeline(steps=[\n ("scaler",StandardScaler()),\n ("svc",SVC(class_weight="balanced"))])\n\n# define the parameter space to search during grid search\nparam_grid = {"svc__kernel":["linear","rbf"],\n "svc__C":np.logspace(-6,4,5),\n "svc__gamma":np.logspace(-8,1,5)}\n\n# perform grid search\ngrid = GridSearchCV(pipe, param_grid=param_grid,\n n_jobs=-1, cv=6, scoring="f1",\n iid=False, verbose=0)\ngrid.fit(X_train,y_train)\n\nprint("Cross validation Grid search:")\nprint("Best parameter (CV F1 score=%0.3f):" % grid.best_score_)\nprint(grid.best_params_)\n\nprint("\\nPerformance on the test set:\\n")\nprint("Confusion matrix:")\ny_pred = grid.predict(X_test)\nprint(confusion_matrix(y_test, y_pred))\nprint("F1: {:0.3f}".format(f1_score(y_test, y_pred)))\nprint("Recall: {:0.3f}".format(recall_score(y_test, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_test, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_test, y_pred)))\n```\n\n Cross validation Grid search:\n Best parameter (CV F1 score=0.880):\n {\'svc__C\': 0.1, \'svc__gamma\': 1e-08, \'svc__kernel\': \'linear\'}\n \n Performance on the test set:\n \n Confusion matrix:\n [[ 4 0]\n [ 2 10]]\n F1: 0.909\n Recall: 0.833\n Precision: 1.000\n Accuracy: 0.875\n \n\nWe actually already perform very well without any fancy preprocessing, beating the human experts whose binary classification accuracy is around 0.796. But we still have a few false negatives in our test set classification, which can be a severe problem for malignant tumors. If a patient is told that they\'re fine, the tumor may get a lot worse until they notice that the diagnosis was wrong. 
Maybe we can improve our benchmark results.\n\nLet\'s start with a simple PCA with only two PCs so that we can plot our data points.\n\n\n```python\nfrom sklearn.decomposition import PCA\n# do a PCA with 2 PCs\n# Note: the input data is centered around 0 automatically before the PCA;\n# the whiten parameter scales the output to unit variance, so it basically acts\n# like a standard scaler\npca = PCA(n_components=2, whiten=True)\nX_train_pca = pca.fit_transform(X_train)\nprint("Variance explained by the first two PCs: {:0.3f}".format(sum(pca.explained_variance_ratio_)))\n```\n\n Variance explained by the first two PCs: 0.921\n \n\nWow! 92% of the original training data\'s variance is preserved when projecting it from 1396 dimensions down to only two. Let\'s plot it: \n\n\n```python\n# split instances into malignant and benign\nidx_b = np.argwhere(y_train==0).reshape(1,-1)\nidx_m = np.argwhere(y_train==1).reshape(1,-1)\nmalignant = X_train_pca[idx_m].reshape(-1,2)\nbenign = X_train_pca[idx_b].reshape(-1,2)\n\n# plot on a scatter plot\nplt.plot(malignant[:,0],malignant[:,1],"^r",label="malignant",ms=8,alpha=.5)\nplt.plot(benign[:,0],benign[:,1],".b",label="benign",ms=10,alpha=.5)\n_ = plt.legend()\n```\n\n\n\n\n\nWe can see that benign and malignant lesions tend towards different regions of the figure, but there is some significant overlap of points which would probably be hard to deconvolve. Maybe we\'ve lost too much of the original data\'s information and we should choose the number of PCs according to how much variance we want to preserve. We can specify that fraction instead of the number of PCs.\n\n\n```python\n# do a PCA preserving 0.95 of the original data\'s variance\nvar = 0.95\npca = PCA(n_components=var)\nX_train_pca = pca.fit_transform(X_train)\nn_comp = len(pca.components_)\nprint("Preserving {} variance requires {} PCs.".format(var,n_comp))\n```\n\n Preserving 0.95 variance requires 3 PCs.\n \n\n\n```python\n# do a PCA preserving 0.99 of the original data\'s variance\nvar = 0.99\npca = PCA(n_components=var)\nX_train_pca = pca.fit_transform(X_train)\nn_comp = len(pca.components_)\nprint("Preserving {} variance requires {} PCs.".format(var,n_comp))\n```\n\n Preserving 0.99 variance requires 6 PCs.\n \n\n\n```python\n# do a PCA preserving 0.999 of the original data\'s variance\nvar = 0.999\npca = PCA(n_components=var)\nX_train_pca = pca.fit_transform(X_train)\nn_comp = len(pca.components_)\nprint("Preserving {} variance requires {} PCs.".format(var,n_comp))\n```\n\n Preserving 0.999 variance requires 11 PCs.\n \n\nThe amount of preserved variance can be used as a hyper-parameter in a grid search.\n\n\n```python\n# put a standard-scaled PCA and a support vector classifier in a pipeline\npipe = Pipeline(steps=[\n ("pca",PCA(whiten=True)),\n ("svc",SVC(kernel="linear",class_weight="balanced"))])\n\n# define the parameter space to search during grid search\nparam_grid = {"svc__C":np.logspace(-4,6,5),\n "pca__n_components":[0.95,0.99,0.999]}\n\n# perform grid search\ngrid = GridSearchCV(pipe, param_grid=param_grid,\n n_jobs=-1, cv=6, scoring="f1",\n iid=False, verbose=0)\ngrid.fit(X_train,y_train)\n\nprint("Cross validation Grid search:")\nprint("Best parameter (CV F1 score=%0.3f):" % grid.best_score_)\nprint(grid.best_params_)\n\nprint("\\nPerformance on the test set:\\n")\nprint("Confusion matrix:")\ny_pred = grid.predict(X_test)\nprint(confusion_matrix(y_test, y_pred))\nprint("F1: {:0.3f}".format(f1_score(y_test, y_pred)))\nprint("Recall: {:0.3f}".format(recall_score(y_test, y_pred)))\nprint("Precision: 
{:0.3f}".format(precision_score(y_test, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_test, y_pred)))\n```\n\n Cross validation Grid search:\n Best parameter (CV F1 score=0.916):\n {\'pca__n_components\': 0.999, \'svc__C\': 3162.2776601683795}\n \n Performance on the test set:\n \n Confusion matrix:\n [[ 4 0]\n [ 2 10]]\n F1: 0.909\n Recall: 0.833\n Precision: 1.000\n Accuracy: 0.875\n \n\nThe cross-validation score was slightly improved, but in our test set there are still two instances misclassified. There is a version of kernel PCA (which basically employs the kernel trick like support vector machines) which allows it to perform nonlinear projections for dimensionality reduction. Let\'s see if we can improve our results with kernel PCA. Unfortunately we can not directly specify the amount of variance we want to preserve, so we\'ll have to calculate manually how many components will be needed for a certain desired preserved variance given a certain kernel (e.g. linear, rbf, polynomial, cosine, using default parameters).\n\n\n```python\nfrom sklearn.decomposition import KernelPCA\n\n# set up a kPCA with linear kernel and calculate all PCs\nkpca = KernelPCA(kernel="linear")\nX_train_kpca = kpca.fit_transform(X_train)\n# calculate the explained variance\nexplained_variance = np.var(X_train_kpca, axis=0)\nexplained_variance_ratio = explained_variance / np.sum(explained_variance)\ncumsum = np.cumsum(explained_variance_ratio)\n# print results\nprint("For kPCA with linear kernel:")\nfor v in [0.95,0.99,0.999]:\n for i,c in enumerate(cumsum):\n if c>v:\n print("Preserving {} variance requires {} PCs.".format(v,i+1))\n break\n```\n\n For kPCA with linear kernel:\n Preserving 0.95 variance requires 3 PCs.\n Preserving 0.99 variance requires 6 PCs.\n Preserving 0.999 variance requires 11 PCs.\n \n\n\n```python\n# set up a kPCA with rbf kernel and calculate all PCs\nkpca = KernelPCA(kernel="rbf")\nX_train_kpca = kpca.fit_transform(X_train)\n# calculate the explained variance\nexplained_variance = np.var(X_train_kpca, axis=0)\nexplained_variance_ratio = explained_variance / np.sum(explained_variance)\ncumsum = np.cumsum(explained_variance_ratio)\n# print results\nprint("For kPCA with rbf kernel:")\nfor v in [0.95,0.99,0.999]:\n for i,c in enumerate(cumsum):\n if c>v:\n print("Preserving {} variance requires {} PCs.".format(v,i+1))\n break\n```\n\n For kPCA with rbf kernel:\n Preserving 0.95 variance requires 57 PCs.\n Preserving 0.99 variance requires 59 PCs.\n Preserving 0.999 variance requires 59 PCs.\n \n\n\n```python\n# set up a kPCA with polynomial kernel and calculate all PCs\nkpca = KernelPCA(kernel="poly")\nX_train_kpca = kpca.fit_transform(X_train)\n# calculate the explained variance\nexplained_variance = np.var(X_train_kpca, axis=0)\nexplained_variance_ratio = explained_variance / np.sum(explained_variance)\ncumsum = np.cumsum(explained_variance_ratio)\n# print results\nprint("For kPCA with polynomial kernel:")\nfor v in [0.95,0.99,0.999]:\n for i,c in enumerate(cumsum):\n if c>v:\n print("Preserving {} variance requires {} PCs.".format(v,i+1))\n break\n```\n\n For kPCA with polynomial kernel:\n Preserving 0.95 variance requires 1 PCs.\n Preserving 0.99 variance requires 2 PCs.\n Preserving 0.999 variance requires 4 PCs.\n \n\n\n```python\n# set up a kPCA with cosine kernel and calculate all PCs\nkpca = KernelPCA(kernel="cosine")\nX_train_kpca = kpca.fit_transform(X_train)\n# calculate the explained variance\nexplained_variance = np.var(X_train_kpca, 
axis=0)\nexplained_variance_ratio = explained_variance / np.sum(explained_variance)\ncumsum = np.cumsum(explained_variance_ratio)\n# print results\nprint("For kPCA with cosine kernel:")\nfor v in [0.95,0.99,0.999]:\n for i,c in enumerate(cumsum):\n if c>v:\n print("Preserving {} variance requires {} PCs.".format(v,i+1))\n break\n```\n\n For kPCA with cosine kernel:\n Preserving 0.95 variance requires 8 PCs.\n Preserving 0.99 variance requires 12 PCs.\n Preserving 0.999 variance requires 18 PCs.\n \n\nIf we want to use both the kernel type and the preserved variance level in hyper-parameter tuning, we should write our own transformer class which handles that easily.\n\n\n```python\nfrom sklearn.base import BaseEstimator, TransformerMixin\n\nclass kPCA(BaseEstimator, TransformerMixin):\n """This class allows tuning the kernel type and the variance to preserve as hyper-parameters"""\n \n pc_dict = {"linear":{0.95:3,0.99:6,0.999:11},\n "rbf":{0.95:57,0.99:59,0.999:59},\n "poly":{0.95:1,0.99:2,0.999:4},\n "cosine":{0.95:8,0.99:12,0.999:18}}\n \n def __init__(self, kernel="linear", var=0.95):\n self.kernel = kernel\n self.var = var\n \n def fit(self, X, y=None):\n # create the KernelPCA here (rather than in __init__) so that parameters\n # set by GridSearchCV via set_params are actually used\n self.kpca = KernelPCA(kernel=self.kernel,\n n_components=self.pc_dict[self.kernel][self.var])\n self.kpca.fit(X)\n return self\n \n def transform(self, X, y=None):\n return self.kpca.transform(X)\n```\n\nLet\'s see if we can up our performance in a final grid search with different kernel PCAs.\n\n\n```python\n# put the custom kPCA, a standard scaler, and a support vector classifier in a pipeline\npipe = Pipeline(steps=[\n ("kpca",kPCA()),\n ("scaler", StandardScaler()),\n ("svc",SVC(kernel="linear",class_weight="balanced"))])\n\n# define the parameter space to search during grid search\nparam_grid = {"kpca__kernel":["linear","rbf","poly","cosine"],\n "kpca__var":[0.95,0.99,0.999],\n "svc__C":np.logspace(-6,6,4)}\n\n# perform grid search\ngrid = GridSearchCV(pipe, param_grid=param_grid,\n n_jobs=-1, cv=6, scoring="f1",\n iid=False, verbose=0)\ngrid.fit(X_train,y_train)\n\nprint("Cross validation Grid search:")\nprint("Best parameter (CV F1 score=%0.3f):" % grid.best_score_)\nprint(grid.best_params_)\n\nprint("\\nPerformance on the test set:\\n")\nprint("Confusion matrix:")\ny_pred = grid.predict(X_test)\nprint(confusion_matrix(y_test, y_pred))\nprint("F1: {:0.3f}".format(f1_score(y_test, y_pred)))\nprint("Recall: {:0.3f}".format(recall_score(y_test, y_pred)))\nprint("Precision: {:0.3f}".format(precision_score(y_test, y_pred)))\nprint("Accuracy: {:0.3f}".format(accuracy_score(y_test, y_pred)))\n```\n\n Cross validation Grid search:\n Best parameter (CV F1 score=0.870):\n {\'kpca__kernel\': \'linear\', \'kpca__var\': 0.95, \'svc__C\': 100.0}\n \n Performance on the test set:\n \n Confusion matrix:\n [[ 4 0]\n [ 2 10]]\n F1: 0.909\n Recall: 0.833\n Precision: 1.000\n Accuracy: 0.875\n \n\nWell, looks like we weren\'t really able to achieve much of an improvement by dimensionality reduction, but we also didn\'t make it worse. That means that the information in the data was as easy for the algorithm to grasp in the high-dimensional space as in the dimensionality-reduced space. We do gain one big advantage through dimensionality reduction though: significantly less computational cost! 
For this example with only 76 instances this may not seem relevant, but for much larger data sets it will be a massive speed-up if we can reduce the number of features from hundreds to only about a dozen without sacrificing too much classification accuracy (or what ever score we use).\n\n### Optimized deep neural networks learning poker hands \n\nThis data set contains examples of possible poker hands which can be used to let a machine learning algorithm learn the rules of the game. It was used in a [research paper](https://pdfs.semanticscholar.org/c068/ea7807367573f4b5f98c0681fca665e9ef74.pdf) in which the authors, R. Cattral, F. Oppacher, and D. Deugo, used evolutionary and symbolic machine learning methods to extract comprehensible and strong rules (the rules of poker in this case) from it. We will try to achieve this with an optimized neural network. You can find the Jupyter Notebook and the data [here](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository/tree/master/Deep-learning-on-poker-hands).\n\nThe data set was contributed to the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Poker+Hand) by Robert Cattral and Franz Oppacher from the Department of Computer Science of Carleton University, which is where I got it from.\n\n#### Frame the Problem\nIn this small project, we\'re not going all the way to building a full poker-bot that can play the entire game of poker against other agents. We just want to see if and how well an algorithm can understand the rules of how poker hands (the combination of cards that result in a certain score) are composed.\n\nThe data we will consider contains one million instances of poker hands. Each poker hand in the data set is an example consisting of five cards which are drawn from a poker deck of 52 cards. Each card has two attributes, its suit (S, 1 to 4, representing Hearts, Spades, Diamonds, and Clubs) and its rank (C, 1 to 13, representing Ace, 2, 3, ... , Queen, and King). For each instance (poker hand) that results in 10 features. There are ten possible classes (0 to 9) which correspond to the card [combinations](https://en.wikipedia.org/wiki/Poker#Gameplay) that can be observed in the game of poker: nothing in hand, one pair, two pairs, three of a kind, straight, flush, full house, four of a kind, straight flush, and royal flush.\n\nWe will treat this as a supervised multi-class classification problem. The rules of this classification are fairly complicated but we also have a lot of instances to train on, which makes the problem very suitable for deep neural networks. 
We will work with a multilayer perceptron architecture, implement it with `TensorFlow 2.0`\'s `Keras` API, and optimize its hyper-parameters with the `hyperas` library using Bayesian optimization.\n\n\n```python\n# import the libraries we\'ll need\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns \nsns.set_style("darkgrid")\n%matplotlib notebook\nimport warnings\nwarnings.filterwarnings(\'ignore\')\nimport tensorflow as tf\nimport tensorflow.keras as keras\n```\n\n#### Data preparation\nLet\'s import the data into a data frame and have a look.\n\n\n```python\n# import data from CSV file\ndf = pd.read_csv("poker-hand-testing.data",\n names = ["S1","C1","S2","C2","S3","C3",\n "S4","C4","S5","C5","target"])\n\n# order the columns to have the ranks (C, numerical) \n# and suits (S, categorical) together\ndf = df[["C1","C2","C3","C4","C5",\n "S1","S2","S3","S4","S5",\n "target"]]\n\n# let\'s have a look by taking a random sample\ndf.sample(10)\n```\n\n\n\n\n
\n(Output: a random sample of 10 rows showing the rank columns C1 to C5, the suit columns S1 to S5, and the target class.)\n
\n\n\n\n\n```python\n# print the data frame\'s info\ndf.info()\n```\n\n <class \'pandas.core.frame.DataFrame\'>\n RangeIndex: 1000000 entries, 0 to 999999\n Data columns (total 11 columns):\n C1 1000000 non-null int64\n C2 1000000 non-null int64\n C3 1000000 non-null int64\n C4 1000000 non-null int64\n C5 1000000 non-null int64\n S1 1000000 non-null int64\n S2 1000000 non-null int64\n S3 1000000 non-null int64\n S4 1000000 non-null int64\n S5 1000000 non-null int64\n target 1000000 non-null int64\n dtypes: int64(11)\n memory usage: 83.9 MB\n \n\n\n```python\n# look for missing values\ndf.isna().sum()\n```\n\n\n\n\n C1 0\n C2 0\n C3 0\n C4 0\n C5 0\n S1 0\n S2 0\n S3 0\n S4 0\n S5 0\n target 0\n dtype: int64\n\n\n\nWe can see that there are apparently no missing values. Let\'s look at how the target classes are distributed.\n\n\n```python\ndf.target.value_counts().sort_index()\n```\n\n\n\n\n 0 501209\n 1 422498\n 2 47622\n 3 21121\n 4 3885\n 5 1996\n 6 1424\n 7 230\n 8 12\n 9 3\n Name: target, dtype: int64\n\n\n\nAs we can see, we\'re dealing with a very strong class imbalance. Some of the poker hands are much rarer than others. We can try to account for this by handing class weights to the classifier, but it will certainly be very difficult, if not impossible, to correctly classify e.g. a straight flush or royal flush.\n\nLet\'s also have a look at how the features are distributed to make sure there are no outliers.\n\n\n```python\n# plot feature distribution as histograms\n_ = df.hist(bins=13,figsize=(6,5))\nplt.tight_layout()\n```\n\n\n\n\n\nGreat, looks like the data is actually distributed as specified. No need to account for invalid values. We can now proceed to splitting off test and validation data sets. Because of the class imbalance, we will stratify the split so that approximately the same class distribution will be present in all sets. For the very rare classes this may not work perfectly. We\'ll also compute the class weights already.\n\n\n```python\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils.class_weight import compute_class_weight\n\n# split off a test set and stratify for classes (target)\ntrain_full, test = train_test_split(df,\n stratify = df["target"],\n test_size=0.2, \n random_state=123)\n\n# split into a training and a validation set and stratify for classes (target)\ntrain, valid = train_test_split(train_full,\n stratify = train_full["target"],\n test_size=0.2,\n random_state=123)\n\n# compute class weights ("balanced" gives each class the weight\n# n_samples / (n_classes * n_samples_in_class)), which will be used\n# to account for the class imbalance\nclass_weights = compute_class_weight("balanced",\n np.unique(train["target"]),\n train["target"])\n```\n\nI really like using handy `pandas` and `sklearn` functions for exploring and preprocessing moderate-sized data. But for really big data that has to be distributed over several machines, we might prefer to use `dask` data frames or to do everything just in `TensorFlow`. `TensorFlow`\'s data API can read data from different sources, handle preprocessing (even though I personally don\'t always find it very handy), and provide tensor datasets, and `TensorFlow` also offers different kinds of feature columns which can be fed directly into a model. 
We\'ll use these features here to explore them a bit, even though we could also just use `sklearn`\'s preprocessing on `pandas` data frames here on my single poor old laptop.\n\nWe\'ll first define a function that turns the `pandas` data frames into `TensorFlow` data sets.\n\n\n```python\ndef df_to_dataset(dataframe, shuffle=True, batch_size=32):\n """\n A function that turns pandas data frames into TensorFlow data sets.\n Params:\n ----------\n dataframe: input pandas data frame\n shuffle: if True, the data set will be pre-shuffled\n batch_size: batch size, meaning number of instances that \n will be passed to model per optimization step\n Returns:\n ---------\n ds: the output data set\n """\n # get copy of data frame\n dataframe = dataframe.copy()\n \n # extract class labels\n labels = dataframe["target"]\n \n # make data set from features (first 10 columns) and labels\n # note: it\'s important to convert data type from int to float32\n # here, TF doesn\'t do that automatically and will throw error\n ds = tf.data.Dataset.from_tensor_slices((dict(dataframe.iloc[:,:10]\n .astype(np.float32)), \n labels))\n # shuffle if desired\n if shuffle:\n ds = ds.shuffle(buffer_size=10000)\n \n # enable infinite repeat with given batch size and prefetch\n ds = ds.repeat().batch(batch_size).prefetch(1)\n \n return ds\n```\n\nNow we can actually transform the data frames into `TensorFlow` data sets which will be fed to our model. But we\'ll have to choose the batch size first, i.e. how many instances will be passed through the model per optimization step.\n\nThe batch size should be large enough to give a precise enough estimate of the gradients during optimization but not so large that it significantly slows down the training iterations. In practice 32 is often chosen as a default. At least that\'s what I understood from `TensorFlow` tutorials and a couple of deep learning books. I have no reason not to trust this advice here.\n\n\n```python\nbatch_size = 32\n# create tensorflow data sets from data frames\ntrain_ds = df_to_dataset(train, batch_size=batch_size)\nvalid_ds = df_to_dataset(valid, shuffle=False, batch_size=batch_size)\ntest_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)\n```\n\nWe will want to normalize the numerical data (first 5 columns) before we send it into the model. We can provide a normalization function to the numerical feature column later, but we\'ll first have to compute each feature\'s mean and standard deviation. We can compute them as `TensorFlow` tensors as well.\n\n\n```python\n# calculate means and standard deviations for the numerical features\nX_means = tf.math.reduce_mean(train.iloc[:,:5].values.astype(np.float32),axis=0)\nX_stds = tf.math.reduce_std(train.iloc[:,:5].values.astype(np.float32),axis=0)\n# since they all have approximately the same distribution, the means should all\n# be around 7.0 and the standard deviations around 3.74\n```\n\nWe can now define the feature columns. We will have 5 numerical columns corresponding to the cards\' ranks, which we will normalize. There will also be 5 categorical columns for the suits. 
Since they are represented as small integers here, we can first put them in 5 numerical columns and then transform these into 5 bucketized columns with 4 buckets each, one for each suit.\n\n\n```python\n# collect all feature columns in a list which we\'ll\n# later feed to the model\'s input layer\nfeature_columns = []\n\n# set up the numerical columns for the cards\' ranks\n# with normalization function: (x-mean)/std\nfor i, header in enumerate(["C1", "C2", "C3", "C4", "C5"]):\n # set up numerical column\n # (bind i as a default argument so that each lambda uses its own column\'s\n # statistics instead of the loop\'s last value of i)\n num_col = tf.feature_column.numeric_column(header,\n normalizer_fn=lambda X, i=i:\n (X - X_means[i]) / X_stds[i])\n # append column to list\n feature_columns.append(num_col)\n\n# set up the bucket columns for the cards\' suits\nfor header in ["S1", "S2", "S3", "S4", "S5"]:\n\n # set up bucket column\n num_col = tf.feature_column.numeric_column(header)\n bucket_col = tf.feature_column.bucketized_column(num_col,\n boundaries=list(\n range(2, 5)))\n # append column to list\n feature_columns.append(bucket_col)\n```\n\n#### Building the model\nNow let\'s set up an initial model with Keras\' sequential API, without thinking too much about parameters yet. We\'ll use 2 hidden layers, 200 units per layer, with exponential linear unit (elu) activation functions and He-initialized weights. After each hidden layer, we\'ll apply batch normalization to prevent vanishing or exploding gradients during training. We can also implement an early-stopping callback which will act against overfitting, as it simply aborts training when the validation loss stops improving.\n\n\n```python\n# building the sequential model\nmodel = keras.Sequential([\n keras.layers.DenseFeatures(feature_columns),\n keras.layers.Dense(200, activation=\'elu\',kernel_initializer="he_normal"),\n keras.layers.BatchNormalization(),\n keras.layers.Dense(200, activation=\'elu\',kernel_initializer="he_normal"),\n keras.layers.BatchNormalization(),\n keras.layers.Dense(10, activation=\'softmax\')\n])\n\n# model checkpoint callback to save checkpoints during training\ncheckpoint_cb = keras.callbacks.ModelCheckpoint("training/poker_hand_model.ckpt",\n save_best_only=True,\n save_weights_only=True,\n verbose=1)\n\n# early stopping callback to prevent overfitting on the training data\nearlystop_cb = keras.callbacks.EarlyStopping(patience=3,\n min_delta=0.01,\n restore_best_weights=True,\n verbose=1)\n\n\n# Compile the model with a Nadam optimizer, initial learning rate \n# of 0.01 is relatively high.\n# Since classes are given in one vector as integers from 0 to 9,\n# we have to use sparse_categorical_crossentropy instead of\n# categorical_crossentropy as the loss function.\nmodel.compile(optimizer=keras.optimizers.Nadam(learning_rate=0.01),\n loss="sparse_categorical_crossentropy",\n metrics=["acc"])\n\n# fit the model with training and validation data, considering class weights \n# and early stopping (hence all 100 epochs probably won\'t run)\nhistory = model.fit(train_ds,\n steps_per_epoch=len(train) // batch_size,\n validation_data=valid_ds,\n validation_steps=len(valid) // batch_size,\n class_weight=class_weights,\n epochs=100,\n callbacks=[earlystop_cb,checkpoint_cb])\n```\n\n Train for 20000 steps, validate for 5000 steps\n Epoch 1/100\n 19994/20000 [============================>.] 
- ETA: 0s - loss: 0.8818 - acc: 0.5876\n Epoch 00001: val_loss improved from inf to 0.80858, saving model to training/poker_hand_model.ckpt\n 20000/20000 [==============================] - 130s 7ms/step - loss: 0.8817 - acc: 0.5877 - val_loss: 0.8086 - val_acc: 0.6579\n Epoch 2/100\n 19998/20000 [============================>.] - ETA: 0s - loss: 0.6582 - acc: 0.7131\n Epoch 00002: val_loss did not improve from 0.80858\n 20000/20000 [==============================] - 115s 6ms/step - loss: 0.6582 - acc: 0.7131 - val_loss: 1.1534 - val_acc: 0.6660\n Epoch 3/100\n 19999/20000 [============================>.] - ETA: 0s - loss: 0.3995 - acc: 0.8401\n Epoch 00003: val_loss improved from 0.80858 to 0.77816, saving model to training/poker_hand_model.ckpt\n 20000/20000 [==============================] - 121s 6ms/step - loss: 0.3995 - acc: 0.8401 - val_loss: 0.7782 - val_acc: 0.8856\n Epoch 4/100\n 19994/20000 [============================>.] - ETA: 0s - loss: 0.3132 - acc: 0.8805\n Epoch 00004: val_loss improved from 0.77816 to 0.31105, saving model to training/poker_hand_model.ckpt\n 20000/20000 [==============================] - 130s 7ms/step - loss: 0.3132 - acc: 0.8805 - val_loss: 0.3110 - val_acc: 0.9051\n Epoch 5/100\n 19998/20000 [============================>.] - ETA: 0s - loss: 0.2695 - acc: 0.8996\n Epoch 00005: val_loss did not improve from 0.31105\n 20000/20000 [==============================] - 118s 6ms/step - loss: 0.2695 - acc: 0.8996 - val_loss: 1.1440 - val_acc: 0.7433\n Epoch 6/100\n 19989/20000 [============================>.] - ETA: 0s - loss: 0.2236 - acc: 0.9187\n Epoch 00006: val_loss did not improve from 0.31105\n 20000/20000 [==============================] - 123s 6ms/step - loss: 0.2235 - acc: 0.9187 - val_loss: 8.0383 - val_acc: 0.8846\n Epoch 7/100\n 19993/20000 [============================>.] - ETA: 0s - loss: 0.2006 - acc: 0.9282Restoring model weights from the end of the best epoch.\n \n Epoch 00007: val_loss did not improve from 0.31105\n 20000/20000 [==============================] - 120s 6ms/step - loss: 0.2006 - acc: 0.9282 - val_loss: 5.8794 - val_acc: 0.9416\n Epoch 00007: early stopping\n \n\nSince the training took a long time, we\'d better save (or load) the model in a serialized form.\n\n\n```python\n# save the model (in TensorFlow\'s serialized SavedModel format, \n# you can also save it in HDF5 format by adding the file ending .h5)\nmodel.save("saved_model/poker_hand_keras_model")\n\n# load the model again\nmodel = keras.models.load_model("saved_model/poker_hand_keras_model")\n```\n\nWe can evaluate the model on the test set.\n\n\n```python\n# evaluate model on test set\nloss, accuracy = model.evaluate(test_ds,steps=len(test)//batch_size)\nprint("Accuracy: ", accuracy, "\\nLoss: ", loss)\n```\n\n 6250/6250 [==============================] - 16s 3ms/step - loss: 0.2396 - acc: 0.9068\n Accuracy: 0.906845 \n Loss: 0.23963412045776844\n \n\nWow, about 91% accuracy, not bad for the first shot. Let\'s make predictions for the first 10 instances of our test set. 
Let\'s have a look at the targets first.\n\n\n```python\ntest.iloc[0:10,:].target.values\n```\n\n\n\n\n array([2, 1, 0, 1, 1, 1, 0, 0, 0, 1])\n\n\n\n\n```python\n# get the first 10 instances from the test data and convert them to data set format\nnew_data = df_to_dataset(test.iloc[0:10,:],shuffle=False,batch_size=10)\n\n# predict class probability\npred_proba = model.predict(new_data,steps=1) \n\n# predict classes\npred_classes = np.argmax(pred_proba,axis=1)\n\npred_classes\n```\n\n\n\n\n array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1])\n\n\n\nIf we have a look at the probabilities predicted for the classes, we can see that the model is actually not very confident in its prediction.\n\n\n```python\nnp.round(pred_proba[0],decimals=3)\n```\n\n\n\n\n array([0. , 0.751, 0.248, 0.001, 0. , 0. , 0. , 0. , 0. ,\n 0. ], dtype=float32)\n\n\n\nIt would be nice if we could be more certain for the minority classes as well. Accuracy is generally not a very good metric for problems with a high class imbalance. The macro-averaged F1 score may be a better metric to optimize for if we want to achieve a good classification for all classes. Furthermore, we\'ve just blindly guessed how many layers and units per layer and so on we use in the model. We can probably do even better by optimizing the hyper-parameters. That\'s what we\'ll do in the next section.\n\n#### Hyper-parameter optimization\n\nFor hyper-parameter optimization we will use the `hyperas` library, which is an easy-to-use Keras wrapper of the `hyperopt` library. To perform the optimization, we will first have to define functions which provide the training and validation data and build the model. I didn\'t manage to get the data function to take global variables; `hyperas` would keep throwing errors. Hence, I provided the entire data preprocessing again in this data function.\n\n\n```python\nfrom hyperopt import Trials, STATUS_OK, tpe\nfrom hyperas import optim\nfrom hyperas.distributions import choice, uniform, loguniform\nfrom sklearn.metrics import f1_score\n\n\n\n# function that provides the data for the optimization\ndef data():\n """Data providing function"""\n \n from sklearn.model_selection import train_test_split\n from sklearn.utils.class_weight import compute_class_weight\n from sklearn.preprocessing import StandardScaler\n from sklearn.compose import ColumnTransformer\n import warnings\n warnings.filterwarnings(\'ignore\')\n \n # import data from CSV file\n df = pd.read_csv("poker-hand-testing.data",\n names = ["S1","C1","S2","C2","S3","C3",\n "S4","C4","S5","C5","target"])\n \n # order the columns to have the ranks (C, numerical) \n # and suits (S, categorical) together\n df = df[["C1","C2","C3","C4","C5",\n "S1","S2","S3","S4","S5",\n "target"]]\n \n # one-hot encode the suits features\n df = pd.get_dummies(df,columns=["S1","S2","S3","S4","S5"])\n\n \n # split off a test set and stratify for classes (target)\n train_full, test = train_test_split(df,\n stratify = df["target"],\n test_size=0.2, \n random_state=123)\n \n # split into a training and a validation set and stratify for classes (target)\n train, valid = train_test_split(train_full,\n stratify = train_full["target"],\n test_size=0.2,\n random_state=123)\n \n # compute class weights which will be used to account for class imbalance\n class_weights = compute_class_weight("balanced",\n np.unique(train["target"]),\n train["target"])\n \n # split features and labels\n X_train, y_train = train.drop("target",axis=1), train["target"].values\n X_valid, y_valid = valid.drop("target",axis=1), 
valid["target"].values\n X_test, y_test = test.drop("target",axis=1), test["target"].values\n \n # get rank and suit column names\n suit_cols = X_train.columns.drop(["C1","C2","C3","C4","C5"])\n rank_cols = ["C1","C2","C3","C4","C5"]\n \n # set up the preprocessor wit a scaler for the ranks\n preprocess = ColumnTransformer(transformers=[\n ("std",StandardScaler(),rank_cols),\n ("pass","passthrough",suit_cols)])\n \n # scale the rank values with a standard scaler\n X_train = preprocess.fit_transform(X_train)\n X_valid = preprocess.transform(X_valid)\n X_test = preprocess.transform(X_test)\n \n \n \n return X_train, y_train, X_vaild, y_vaild, X_test, y_test, class_weights\n \n# function that builds the model for the optimization\ndef build_model(X_train, y_train, X_vaild, y_vaild, X_test, y_test, class_weights):\n """Model providing function""" \n \n # parameters to optimize\n num_layers = {{choice([2,3,4])}}\n num_units = int({{uniform(50,400)}})\n learning_rate = {{loguniform(-8,-4)}}\n batch_size=32\n \n print(f"New parameters:\\nNumber of layers: {num_layers}\\nNumber of units: {num_units}\\nLearning rate: {learning_rate}")\n \n # create model and add input layer with feature columns\n model = keras.Sequential()\n model.add(keras.layers.InputLayer(input_shape=X_train.shape[1:],batch_size=batch_size))\n \n # add hidden layer\n for i in range(num_layers):\n model.add(keras.layers.Dense(units=num_units,\n activation=\'elu\',\n kernel_initializer="he_normal"))\n model.add(keras.layers.BatchNormalization())\n \n # add ourput layer\n model.add(keras.layers.Dense(units=10, \n activation=\'softmax\',))\n \n # compile model with optimizer\n model.compile(optimizer=keras.optimizers.Nadam(learning_rate=learning_rate),\n loss="sparse_categorical_crossentropy",\n metrics=["acc"])\n \n # fitthe model with class weights applied\n model.fit(x=X_train,\n y=y_train,\n batch_size=batch_size,\n validation_data=(X_valid,y_valid),\n class_weight=class_weights,\n epochs=1,\n verbose=0)\n \n # make class predictions with validation data\n pred_proba = model.predict(X_valid) \n pred_calsses = np.argmax(pred_proba,axis=1)\n \n # calculate the macro-averaged F1 score based on the predictions\n f1 = f1_score(y_valid,pred_calsses,average="macro")\n \n # print results of the optimization round\n print(f"F1 score: {f1:.3f}\\nfor\\nNumber of layers: {num_layers}\\nNumber of units: {num_units}\\nLearning rate: {learning_rate}")\n \n \n return {\'loss\': -f1, \'status\': STATUS_OK, \'model\': model}\n\n```\n\n Using TensorFlow backend.\n \n\nNow that the functions are set up, let\'s perform 30 optimization steps.\n\n\n```python\n# perform a hyperas optimization\nbest_run, best_model = optim.minimize(model=build_model,\n data=data,\n algo=tpe.suggest,\n max_evals=30,\n trials=Trials(),\n notebook_name=\'Deep_learning_poker_hand\')\n```\n\nLet\'s have a look at the best parameters we found. \n\n\n```python\nbest_run\n```\n\n\n```python\nbest_model.summary()\n```\n\nLooks like we should train a model with 3 hidden layers, 265 units per layer, and a learning rate of 0.12. 
Let\'s do that and see how good we can get.\n\n\n```python\ndef build_opt_model():\n # bulding the sequential model\n opt_model = keras.Sequential([\n keras.layers.DenseFeatures(feature_columns),\n keras.layers.Dense(265, activation=\'elu\', kernel_initializer="he_normal"),\n keras.layers.BatchNormalization(),\n keras.layers.Dense(265, activation=\'elu\', kernel_initializer="he_normal"),\n keras.layers.BatchNormalization(),\n keras.layers.Dense(265, activation=\'elu\', kernel_initializer="he_normal"),\n keras.layers.BatchNormalization(),\n keras.layers.Dense(10, activation=\'softmax\')\n ])\n \n # model checkpoint call back to save checkpints during training\n checkpoint_cb = keras.callbacks.ModelCheckpoint(\n "training/poker_hand_model.ckpt",\n save_best_only=True,\n save_weights_only=True,\n verbose=1)\n \n # early stopping callback to prevent overfitting on the training data\n earlystop_cb = keras.callbacks.EarlyStopping(patience=5,\n min_delta=0.01,\n restore_best_weights=True,\n verbose=1)\n \n # Compile the model with a Nadam optimizer \n opt_model.compile(optimizer=keras.optimizers.Nadam(learning_rate=0.012),\n loss="sparse_categorical_crossentropy",\n metrics=["acc"])\n \n return opt_model, checkpoint_cb, earlystop_cb\n \nopt_model, checkpoint_cb, earlystop_cb = build_opt_model()\n\n# fit the model with trianing and validation data, considering class weights\n# and early stopping (hence 100 epochs probably wont run)\nhistory = opt_model.fit(train_ds,\n steps_per_epoch=len(train) // batch_size,\n validation_data=valid_ds,\n validation_steps=len(valid) // batch_size,\n class_weight=class_weights,\n epochs=100,\n callbacks=[earlystop_cb, checkpoint_cb])\n```\n\n Train for 20000 steps, validate for 5000 steps\n Epoch 1/100\n 19996/20000 [============================>.] - ETA: 0s - loss: 0.8743 - acc: 0.5968\n Epoch 00001: val_loss improved from inf to 0.68111, saving model to training/poker_hand_model.ckpt\n 20000/20000 [==============================] - 189s 9ms/step - loss: 0.8742 - acc: 0.5969 - val_loss: 0.6811 - val_acc: 0.7055\n Epoch 2/100\n 19998/20000 [============================>.] - ETA: 0s - loss: 0.4881 - acc: 0.8041\n Epoch 00002: val_loss did not improve from 0.68111\n 20000/20000 [==============================] - 178s 9ms/step - loss: 0.4881 - acc: 0.8041 - val_loss: 1.3419 - val_acc: 0.6085\n Epoch 3/100\n 19996/20000 [============================>.] - ETA: 0s - loss: 0.3234 - acc: 0.8769\n Epoch 00003: val_loss did not improve from 0.68111\n 20000/20000 [==============================] - 191s 10ms/step - loss: 0.3234 - acc: 0.8769 - val_loss: 98.4170 - val_acc: 0.8010\n Epoch 4/100\n 19993/20000 [============================>.] - ETA: 0s - loss: 0.2656 - acc: 0.9019\n Epoch 00004: val_loss did not improve from 0.68111\n 20000/20000 [==============================] - 182s 9ms/step - loss: 0.2656 - acc: 0.9019 - val_loss: 1.0344 - val_acc: 0.6891\n Epoch 5/100\n 19994/20000 [============================>.] - ETA: 0s - loss: 0.2323 - acc: 0.9150\n Epoch 00005: val_loss did not improve from 0.68111\n 20000/20000 [==============================] - 182s 9ms/step - loss: 0.2323 - acc: 0.9150 - val_loss: 0.8909 - val_acc: 0.8548\n Epoch 6/100\n 19996/20000 [============================>.] 
- ETA: 0s - loss: 0.2076 - acc: 0.9251Restoring model weights from the end of the best epoch.\n \n Epoch 00006: val_loss did not improve from 0.68111\n 20000/20000 [==============================] - 186s 9ms/step - loss: 0.2076 - acc: 0.9251 - val_loss: 1609.0826 - val_acc: 0.8910\n Epoch 00006: early stopping\n \n\nLet\'s save the model again see how we perform on the F1 score now.\n\n\n```python\n# save the model\nopt_model.save("saved_model/poker_hand_keras_opt_model")\n\n# load the model again\nopt_model = keras.models.load_model("saved_model/poker_hand_keras_opt_model")\n```\n\n WARNING:tensorflow:From /home/pascal/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1781: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n Instructions for updating:\n If using Keras pass *_constraint arguments to layers.\n INFO:tensorflow:Assets written to: saved_model/poker_hand_keras_opt_model/assets\n \n\n\n```python\nfrom sklearn.metrics import f1_score\n\n# make predictions for the test set (steps = number of instances / batch size)\npred_proba = opt_model.predict(test_ds,steps=6250) \npred_calsses = np.argmax(pred_proba,axis=1)\n \n# calculate the macro-averaged F1 score based on the predictions\nf1_macro = f1_score(test["target"].values,pred_calsses,average="macro")\nf1_micro = f1_score(test["target"].values,pred_calsses,average="micro")\n\nprint(f"F1-macro: {f1_macro:.2f}\\nF1-micro: {f1_micro:.2f}")\n```\n\n F1-macro: 0.19\n F1-micro: 0.71\n \n\nWe can see that the micro-averaged F1 score, which considers all instances individually, is not too bad. But the macro-averaged F1 score, which is an unweighted average of every classes\' F1 score, is much worse. This clearly shows that we still have a big problem with classifying the strongly underrepresented classes correctly. This becomes clear when we look at the confusion matrix below. \n\n\n```python\nfrom sklearn.metrics import confusion_matrix\nconfusion_matrix(test["target"].values,pred_calsses)\n```\n\n\n\n\n array([[95597, 4644, 1, 0, 0, 0, 0, 0, 0,\n 0],\n [40088, 44265, 53, 94, 0, 0, 0, 0, 0,\n 0],\n [ 538, 8284, 63, 639, 0, 0, 0, 0, 0,\n 0],\n [ 10, 2696, 60, 1458, 0, 0, 0, 0, 0,\n 0],\n [ 427, 350, 0, 0, 0, 0, 0, 0, 0,\n 0],\n [ 393, 6, 0, 0, 0, 0, 0, 0, 0,\n 0],\n [ 0, 74, 2, 209, 0, 0, 0, 0, 0,\n 0],\n [ 0, 0, 0, 42, 0, 0, 3, 1, 0,\n 0],\n [ 2, 0, 0, 0, 0, 0, 0, 0, 0,\n 0],\n [ 1, 0, 0, 0, 0, 0, 0, 0, 0,\n 0]])\n\n\n\n#### Resampling the data set to counter class imbalance\nAs we have seen above, the neural network has trouble classifying the underrepresented classes correctly. One approach we can try is to bootstrap (sample with replacement) a new dataset from the original data set in which the minority classes are oversampled so that all classes are balanced. We don\'t actually provide any new data but since training a neural network is a stochastic process, we can try to reduce its bias towards the majority class with this method.\n\nLet\'s bootstrap 100000 samples for each of the classes.\n\n\n```python\n# create data frame with 1,000,000 bootstrapped instances, 100,000 from each class\ndf_resampled = pd.concat([\n df.loc[df.target == label, :].sample(100000, replace=True)\n for label in df.target.unique()\n])\n```\n\nAnd split it into train, validation, and test sets again. 
The suffix _rs stands for resampled.\n\n\n```python\n# split off a test set and stratify for classes (target)\ntrain_full_rs, test_rs = train_test_split(df_resampled,\n stratify=df_resampled["target"],\n test_size=0.2,\n random_state=123)\n\n# split into a training and a validation set and stratify for classes (target)\ntrain_rs, valid_rs = train_test_split(train_full_rs,\n stratify=train_full_rs["target"],\n test_size=0.2,\n random_state=123)\n```\n\nRecompute column means and standard deviations (due to the oversampling of the minority classes, these quantities are quite a bit different now).\n\n\n```python\n# calculate means and standard deviations for the numerical features\nX_means = tf.math.reduce_mean(train_rs.iloc[:, :5].values.astype(np.float32),\n axis=0)\nX_stds = tf.math.reduce_std(train_rs.iloc[:, :5].values.astype(np.float32),\n axis=0)\nprint(X_means, X_stds)\n```\n\n tf.Tensor([7.245467 7.151083 7.5978374 7.1570797 7.494875 ], shape=(5,), dtype=float32) tf.Tensor([3.775464 3.740848 3.7774026 3.7822576 3.6998467], shape=(5,), dtype=float32)\n \n\nConvert the data to `TensorFlow` data set format.\n\n\n```python\nbatch_size = 32\n# create tensorflow data sets from data frames\ntrain_rs_ds = df_to_dataset(train_rs, batch_size=batch_size)\nvalid_rs_ds = df_to_dataset(valid_rs, shuffle=False, batch_size=batch_size)\ntest_rs_ds = df_to_dataset(test_rs, shuffle=False, batch_size=batch_size)\n```\n\nTrain a new model with the resampled data.\n\n\n```python\nopt_model_rs, checkpoint_cb, earlystop_cb = build_opt_model()\n\n# fit the model with trianing and validation data, considering class weights\n# and early stopping (hence 100 epochs probably wont run)\nhistory_rs = opt_model_rs.fit(train_rs_ds,\n steps_per_epoch=len(train_rs) // batch_size,\n validation_data=valid_rs_ds,\n validation_steps=len(valid_rs) // batch_size,\n epochs=100,\n callbacks=[earlystop_cb, checkpoint_cb])\n```\n\n Train for 20000 steps, validate for 5000 steps\n Epoch 1/100\n 19993/20000 [============================>.] - ETA: 0s - loss: 0.7889 - acc: 0.6875\n Epoch 00001: val_loss improved from inf to 0.38209, saving model to training/poker_hand_model.ckpt\n 20000/20000 [==============================] - 213s 11ms/step - loss: 0.7888 - acc: 0.6876 - val_loss: 0.3821 - val_acc: 0.8474\n Epoch 2/100\n 19998/20000 [============================>.] - ETA: 0s - loss: 0.3663 - acc: 0.8611\n Epoch 00002: val_loss did not improve from 0.38209\n 20000/20000 [==============================] - 149s 7ms/step - loss: 0.3663 - acc: 0.8611 - val_loss: 0.9238 - val_acc: 0.9216\n Epoch 3/100\n 19993/20000 [============================>.] - ETA: 0s - loss: 0.2054 - acc: 0.9277\n Epoch 00003: val_loss did not improve from 0.38209\n 20000/20000 [==============================] - 142s 7ms/step - loss: 0.2054 - acc: 0.9278 - val_loss: 184.8528 - val_acc: 0.9716\n Epoch 4/100\n 19992/20000 [============================>.] - ETA: 0s - loss: 0.1384 - acc: 0.9538\n Epoch 00004: val_loss did not improve from 0.38209\n 20000/20000 [==============================] - 153s 8ms/step - loss: 0.1384 - acc: 0.9538 - val_loss: 8.2383 - val_acc: 0.9768\n Epoch 5/100\n 19999/20000 [============================>.] - ETA: 0s - loss: 0.1036 - acc: 0.9666\n Epoch 00005: val_loss did not improve from 0.38209\n 20000/20000 [==============================] - 182s 9ms/step - loss: 0.1036 - acc: 0.9666 - val_loss: 15.8309 - val_acc: 0.9879\n Epoch 6/100\n 19999/20000 [============================>.] 
- ETA: 0s - loss: 0.0878 - acc: 0.9723Restoring model weights from the end of the best epoch.\n \n Epoch 00006: val_loss did not improve from 0.38209\n 20000/20000 [==============================] - 175s 9ms/step - loss: 0.0877 - acc: 0.9723 - val_loss: 433.6815 - val_acc: 0.9931\n Epoch 00006: early stopping\n \n\n\n```python\n# save the model\nopt_model_rs.save("saved_model/poker_hand_keras_opt_model_rs")\n\n# load the model again\nopt_model_rs = keras.models.load_model("saved_model/poker_hand_keras_opt_model_rs")\n```\n\n INFO:tensorflow:Assets written to: saved_model/poker_hand_keras_opt_model_rs/assets\n \n\nOkay, now let\'s make some predictions again and look at the F1 scores.\n\n\n```python\n# make predictions for the test set (steps = number of instances / batch size)\npred_proba = opt_model_rs.predict(test_rs_ds,steps=6250) \npred_classes = np.argmax(pred_proba,axis=1)\n \n# calculate the macro-averaged F1 score based on the predictions\nf1_macro = f1_score(test_rs["target"].values,pred_classes,average="macro")\nf1_micro = f1_score(test_rs["target"].values,pred_classes,average="micro")\n\nprint(f"F1-macro: {f1_macro:.2f}\\nF1-micro: {f1_micro:.2f}")\nconfusion_matrix(test_rs["target"].values,pred_classes)\n```\n\n F1-macro: 0.84\n F1-micro: 0.85\n \n\n\n\n\n array([[13296, 4792, 971, 5, 911, 25, 0, 0, 0,\n 0],\n [ 2294, 9522, 4722, 1308, 2064, 8, 82, 0, 0,\n 0],\n [ 14, 2171, 11278, 2226, 2128, 0, 2141, 42, 0,\n 0],\n [ 0, 34, 303, 15868, 28, 0, 2648, 1119, 0,\n 0],\n [ 10, 72, 29, 0, 19889, 0, 0, 0, 0,\n 0],\n [ 0, 0, 0, 0, 0, 19943, 0, 0, 57,\n 0],\n [ 0, 0, 0, 36, 0, 0, 19784, 180, 0,\n 0],\n [ 0, 0, 0, 0, 0, 0, 0, 20000, 0,\n 0],\n [ 0, 0, 0, 0, 0, 0, 0, 0, 20000,\n 0],\n [ 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 20000]])\n\n\n\nNow the micro- and macro-averaged F1 scores are almost identical. The resampling has certainly helped with the class imbalance and slightly improved the overall performance. Looking at the confusion matrix, we can see that the very strongly oversampled minority classes are unsurprisingly all classified correctly because they\'re always the same reoccurring examples, whereas the original majority class still shows several misclassifications, most likely because not all possible examples of these classes were sampled from the original data. \n\n### ResNet CNN for classifying cats and dogs \nThis Asirra (Animal Species Image Recognition for Restricting Access) data set is a HIP (Human Interactive Proof) that works by asking users to identify photographs of cats and dogs. It contains pictures of cats and dogs which can be used to train an image classifier. This time it does not originate from the UCI machine learning repository, but from a [Kaggle competition](https://www.kaggle.com/c/dogs-vs-cats) which was hosted in 2013. The Jupyter Notebook can be found [here](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository/tree/master/ResNet-CNNs-on-cats-and-dogs).\n\nSince my crappy old laptop only has a weak CPU but I want to use GPUs, I\'m actually running this project on [Google colab](https://colab.research.google.com/).\n\n#### Frame the problem\nWe have a balanced data set of 25000 images of cats and dogs to train a classifier which can tell them apart. While the number of images is not super small, it is still far from enough to easily train any kind of neural network. We will use convolutional neural networks (CNN) which recognize image features much more data-efficiently than fully connected networks. 
To artificially enlarge the data set and counter overfitting, we will use data augmentation techniques to modify the images before we pass them into the CNN. Instead of freely experimenting with the CNN architecture, we will choose an architecture which has proven to work very well in past benchmark tests: the ResNet. Furthermore, we will try to train this network from scratch as well as using a network which was trained on the [ImageNet](http://www.image-net.org/) data set to see how much performance we can gain by using pretrained weights. Let\'s get started!\n\n\n```python\n%pip install tensorflow-addons\n%pip install tensorflow-gpu\n# import the libraries we\'ll need\nimport numpy as np \nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set_style("white")\nimport tensorflow as tf\nimport tensorflow.keras as keras\nimport tensorflow_addons as tfa\nfrom sklearn.model_selection import train_test_split\nimport os\n```\n\n#### Preparing the data\nFirst we have to set up a pipeline to get the images and then prepare the image augmentation. Let\'s load the file paths and labels into a data frame first.\n\n\n```python\ndata_path = "./data/train/"\nfilenames = os.listdir(data_path)\nlabels = []\nfor filename in filenames:\n label = filename.split(\'.\')[0]\n if label == \'dog\':\n labels.append("dog")\n else:\n labels.append("cat")\n\ndf = pd.DataFrame({\n \'filename\': filenames,\n \'label\': labels\n})\n```\n\n\n```python\n# let\'s look at a random image\nplt.imshow(keras.preprocessing.image.load_img(data_path+df.iloc[100].filename))\n```\n\n\nYep, that looks like a cat. Let\'s first split off training, validation, and test sets.\n\n\n```python\ntrain_df_full, test_df = train_test_split(df, test_size=0.20, random_state=42)\ntrain_df, validate_df = train_test_split(train_df_full, test_size=0.25, random_state=42)\ntrain_df = train_df.reset_index(drop=True)\nvalidate_df = validate_df.reset_index(drop=True)\ntest_df = test_df.reset_index(drop=True)\n```\n\nLet\'s define some constants which we\'ll use later.\n\n\n```python\nIMAGE_WIDTH=224\nIMAGE_HEIGHT=224\nIMAGE_SIZE=(IMAGE_WIDTH, IMAGE_HEIGHT)\nIMAGE_CHANNELS=3\nCLASS_NAMES = np.array(["dog","cat"])\nBATCH_SIZE=64\nAUTOTUNE = tf.data.experimental.AUTOTUNE\n\n# tell Tensorflow to use XLA (accelerated linear algebra)\ntf.config.optimizer.set_jit(True)\n```\n\nNow we can prepare a generator to stream data from for training the model. Keras\' image preprocessing utilities provide a function for this. 
For the training data, we\'ll implement the augmentation as well.\n\n\n```python\n# The image generator for the training data will apply augmentation operations\n# such as rotation, shear, zoom, shifting and horizontal flipping.\ntrain_datagen = keras.preprocessing.image.ImageDataGenerator(\n rotation_range=10,\n rescale=1./255,\n shear_range=0.1,\n zoom_range=0.2,\n horizontal_flip=True,\n width_shift_range=0.1,\n height_shift_range=0.1\n)\n\n# we will then let the generator stream the data from the filenames \n# stored in the data frames\ntrain_generator = train_datagen.flow_from_dataframe(\n train_df, \n data_path, \n x_col="filename",\n y_col="label",\n target_size=IMAGE_SIZE,\n class_mode=\'categorical\',\n batch_size=BATCH_SIZE\n)\n```\n\n Found 15000 validated image filenames belonging to 2 classes.\n \n\nLet\'s have a look at the augmentations which the generator applies.\n\n\n```python\nexample_df = train_df.sample(n=1).reset_index(drop=True)\nexample_generator = train_datagen.flow_from_dataframe(\n example_df, \n data_path, \n x_col=\'filename\',\n y_col=\'label\',\n target_size=IMAGE_SIZE,\n class_mode=\'categorical\'\n)\n```\n\n Found 1 validated image filenames belonging to 1 classes.\n \n\n\n```python\nplt.figure(figsize=(12, 12))\nfor i in range(0, 15):\n plt.subplot(5, 3, i+1)\n for X_batch, Y_batch in example_generator:\n image = X_batch[0]\n plt.imshow(image)\n break\nplt.tight_layout()\nplt.show()\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/exploring-the-UCI-ML-repository/catdog_output_18_0.png)\n\n\nWhile using the Keras preprocessing utilities is very convenient, it is unfortunately not very fast and well-integrated with the TensorFlow ecosystem. Luckily, we can also apply these augmentation by using tensorflow functions and feeding the data into the model in the TensorFlow data set format.\n\n\n```python\n# load the list of filenames as datasets\ntrain_ds = tf.data.Dataset.from_tensor_slices(data_path + train_df.filename.values)\nvalidate_ds = tf.data.Dataset.from_tensor_slices(data_path + validate_df.filename.values)\ntest_ds = tf.data.Dataset.from_tensor_slices(data_path + test_df.filename.values)\n\n\nfor f in train_ds.take(5):\n print(f.numpy())\n```\n\n b\'./data/train/cat.8410.jpg\'\n b\'./data/train/dog.12008.jpg\'\n b\'./data/train/dog.6125.jpg\'\n b\'./data/train/cat.8437.jpg\'\n b\'./data/train/dog.5051.jpg\'\n \n\nNow we can implement the augmentation functions.\n\n\n```python\n@tf.function(experimental_relax_shapes=True)\ndef augment(img):\n """Apply random horizontal flipping, rotation, shearing, \n shifting, zooming, and change of brightness, contrats,\n and saturation. 
The scaling factors are all random,\n look at the individual lines to see the boundaries.\n \n Args:\n img: Image\n Returns:\n Augmented image\n """\n \n # horizontal flipping \n img = tf.image.random_flip_left_right(img)\n \n # rotation\n img = tfa.image.rotate(img, tf.random.uniform((1,),-0.2,0.2)[0], interpolation=\'BILINEAR\')\n \n # shearing\n shear_lambda = tf.random.uniform((1,),-0.1,0.1)[0]\n forward_transform = [[1.0,shear_lambda,0],[0,1.0,0],[0,0,1.0]]\n t = tfa.image.transform_ops.matrices_to_flat_transforms(tf.linalg.inv(forward_transform))\n img = tfa.image.transform(img, t, interpolation="BILINEAR")\n \n # shifting\n trans = tf.random.uniform((1,),-0.1,0.1)[0]\n img = tfa.image.translate(img, translations=[trans,trans])\n \n # zoom by cropping and resizing\n offset = tf.random.uniform((1,),0.0,0.1)[0]\n shift = tf.random.uniform((1,),0.9,1.0)[0]\n img_crp = tf.image.crop_and_resize(tf.reshape(img,[1,img.shape[0],img.shape[1],3]), \n boxes=[[offset,offset,shift,shift]],\n box_indices=[0],\n crop_size=[img.shape[0], img.shape[1]])\n img = tf.reshape(img_crp,[img.shape[0],img.shape[1],3])\n \n # change brightness, contrast, and saturation\n img = tf.image.adjust_brightness(img, tf.random.uniform((1,),-0.2,0.2)[0])\n img = tf.image.adjust_contrast(img, contrast_factor=1+tf.random.uniform((1,),-0.1,0.1)[0])\n img = tf.image.adjust_saturation(img,1+tf.random.uniform((1,),-0.2,0.2)[0])\n \n return img\n\n\n```\n\nNow we\'ll have to write some functions to get the labels and convert the images to tensors.\n\n\n```python\n@tf.function(experimental_relax_shapes=True)\ndef get_label(file_path):\n # convert the path to a list of path components\n parts = tf.strings.split(file_path, "/")\n parts = tf.strings.split(parts[-1], ".")\n # note that we output [[1],[0]] for dog and [[0],[1]] for cat\n # because we will fit with binary cross entropy loss.\n # we could also output [[1]] or [[0]] respectively,\n # if we use sparse categorical cross entropy\n if parts[0] == "dog":\n return np.array([1,0]).reshape(-1,1)\n else:\n return np.array([0,1]).reshape(-1,1)\n\n@tf.function \ndef decode_img(img):\n # convert the compressed string to a 3D uint8 tensor\n img = tf.image.decode_jpeg(img, channels=3)\n # Use `convert_image_dtype` to convert to floats in the [0,1] range.\n img = tf.image.convert_image_dtype(img, tf.float32)\n # resize the image to the desired size.\n return tf.image.resize(img, [IMAGE_WIDTH, IMAGE_HEIGHT])\n\n@tf.function\ndef process_path(file_path):\n label = get_label(file_path)\n # load the raw data from the file as a string\n img = tf.io.read_file(file_path)\n img = decode_img(img)\n return img, label\n\n@tf.function\ndef process_path_aug(file_path):\n label = get_label(file_path)\n # load the raw data from the file as a string\n img = tf.io.read_file(file_path)\n img = decode_img(img)\n img = augment(img)\n return img, label\n\ntrain_aug_ds = train_ds.map(process_path_aug, num_parallel_calls=AUTOTUNE)\ntrain_noaug_ds = train_ds.map(process_path, num_parallel_calls=AUTOTUNE)\nvalidate_ds = validate_ds.map(process_path, num_parallel_calls=AUTOTUNE)\ntest_ds = test_ds.map(process_path, num_parallel_calls=AUTOTUNE)\n```\n\nOkay, let\'s have a look whether things are in the right shape.\n\n\n```python\nfor img, label in train_aug_ds.take(2):\n print("Image shape: ", img.numpy().shape)\n print("Label: ", label.numpy())\n```\n\n Image shape: (224, 224, 3)\n Label: [[0]\n [1]]\n Image shape: (224, 224, 3)\n Label: [[1]\n [0]]\n \n\nAnd let\'s see what the augmented vs. 
unaugmented image looks like.\n\n\n```python\nfor img, label in train_aug_ds.take(1):\n img1 = img\nfor img, label in train_noaug_ds.take(1):\n img2 = img\n \nplt.figure()\nplt.subplot(1,2,1)\nplt.imshow(img1)\nplt.subplot(1,2,2)\nplt.imshow(img2)\nplt.show()\n```\n\n Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).\n \n\n\n\nNow the last step is to prepare the data sets for training by shuffling, batching, and prefetching.\n\n\n```python\ndef prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):\n # If the data set is too large to fit in memory then use \n # `.cache(filename)` to cache preprocessing work for datasets \n # that don\'t fit in memory.\n if cache:\n if isinstance(cache, str):\n ds = ds.cache(cache)\n else:\n ds = ds.cache()\n\n ds = ds.shuffle(buffer_size=shuffle_buffer_size)\n \n # Repeat forever\n ds = ds.repeat()\n \n ds = ds.batch(BATCH_SIZE)\n \n # `prefetch` lets the dataset fetch batches in the background while the model\n # is training.\n ds = ds.prefetch(buffer_size=AUTOTUNE)\n \n return ds\n\ntrain_aug_ds = prepare_for_training(train_aug_ds)\ntrain_noaug_ds = prepare_for_training(train_noaug_ds)\nvalidate_ds = prepare_for_training(validate_ds)\ntest_ds = prepare_for_training(test_ds)\n```\n\n#### Building the models\nWe can finally focus on the model. As already mentioned in the title of this notebook, we want to use a ResNet or residual network, developed by [He et al.](https://github.com/KaimingHe/deep-residual-networks). This type of network won the ILSVRC 2015 challenge with a top-5 error rate under 3.6%. The special design of this network includes residual units with skip connections. These connections basically skip a layer of the network and feed the signal directly into a layer that is higher up in the stack. Why is that useful at all? When a network is initialized with weights close to zero, it will also output values close to zero. Layers that have not started learning can block the flow of backpropagation in the network which makes it difficult to train very deep networks. With skip connections, however, the network can just output its input and easily propagate a signal.\n\nThe more parameters we want to learn, the more data we need. Since we don\'t have an awful lot of data, we\'ll use a smaller ResNet architecture, the ResNet-34 for the model we\'re building from scratch. Later we\'ll compare it to a pretrained ResNet-50. 
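\n\nIn equation form, a residual unit computes $y = \\sigma(\\mathcal{F}(x) + x)$, where $\\mathcal{F}(x)$ is the output of the stacked convolution and batch normalization layers, $x$ is the input carried over the skip connection (passed through a 1x1 convolution only when the output dimensions change), and $\\sigma$ is the activation function.\n\n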
Let\'s build a class for the residual units first.\n\n\n```python\nfrom functools import partial\n\n# This will be the default convolutional layer with a 3x3 filter moving with a stride of 1 and\n# applying SAME padding (padding the input with zeroes so that the output has the same shape)\nDefaultConv2D = partial(keras.layers.Conv2D, kernel_size=3, strides=1,padding="SAME", use_bias=False)\n\n# this will be the class for a residual unit layer\nclass ResidualUnit(keras.layers.Layer):\n\n def __init__(self, filters, strides=1, activation="relu", **kwargs):\n super().__init__(**kwargs)\n self.activation = keras.activations.get(activation)\n \n # The main layers: 2 convolutional layers (the first one may have a larger stride),\n # with batch normalization after each layer\n self.main_layers = [\n DefaultConv2D(filters, strides=strides),\n keras.layers.BatchNormalization(),\n self.activation,\n DefaultConv2D(filters),\n keras.layers.BatchNormalization()]\n \n # if the stride is 2, the output dimensions will be halved so \n # we need a 1x1 convolutional layer with a stride of 2 to \n # adjust the output dimensions of the skip connection\n self.skip_layers = []\n if strides > 1:\n self.skip_layers = [\n DefaultConv2D(filters, kernel_size=1, strides=strides),\n keras.layers.BatchNormalization()]\n \n def call(self, inputs):\n # propagate the input through the main and skip layers (if present)\n # and return the sum of the two outputs through the activation fucntion\n main = inputs\n for layer in self.main_layers:\n main = layer(main)\n \n skip = inputs\n for layer in self.skip_layers:\n skip = layer(skip)\n \n return self.activation(main + skip)\n```\n\nNow we can use this residual unit to build a residual network with it.\n\n\n```python\n# get the input shape\ninp, _ = next(iter(train_aug_ds.take(1)))\ninput_shape = inp.numpy().shape[1:]\n\n# set up the model\nmodel = keras.models.Sequential()\nmodel.add(keras.layers.InputLayer(input_shape=input_shape))\n\n# initial convolutional layer\nmodel.add(DefaultConv2D(64, kernel_size=7, strides=2))\nmodel.add(keras.layers.BatchNormalization())\nmodel.add(keras.layers.Activation("relu"))\nmodel.add(keras.layers.MaxPool2D(pool_size=3, strides=2, padding="SAME"))\n\n# Every few steps we go deeper down the network, we will double the\n# number of filter maps and reduce the dimensions of the output by\n# applying a stride of 2. 
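\n# The list below follows the ResNet-34 layout: 3 residual units with 64 filters,\n# 4 with 128, 6 with 256, and 3 with 512. With 2 convolutional layers per unit,\n# plus the initial convolution and the final dense layer, this adds up to roughly 34 layers.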
\nprev_filters = 64\nfor filters in [64] * 3 + [128] * 4 + [256] * 6 + [512] * 3:\n strides = 1 if filters == prev_filters else 2\n model.add(ResidualUnit(filters, strides=strides))\n prev_filters = filters\n \n# apply global average pooling and feed the output directly into \n# the output layer which has a sigmoid activation\nmodel.add(keras.layers.Dropout(0.1))\nmodel.add(keras.layers.GlobalAvgPool2D())\nmodel.add(keras.layers.Flatten())\nmodel.add(keras.layers.Dense(2, activation="sigmoid"))\n\n# compile the model with Nadam optimizer\noptimizer = keras.optimizers.Nadam()\nmodel.compile(optimizer=optimizer,loss="binary_crossentropy",metrics=["accuracy"])\n\n```\n\n\n```python\nhistory = model.fit(train_aug_ds, \n steps_per_epoch=15000//BATCH_SIZE,\n validation_data=validate_ds,\n validation_steps=5000//BATCH_SIZE,\n epochs=1)\n```\n\n Train for 234 steps, validate for 78 steps\n 234/234 [==============================] - 295s 1s/step - loss: 0.6947 - accuracy: 0.6079 - val_loss: 0.7266 - val_accuracy: 0.5571\n \n\n\n```python\nmodel.save("ResNet34_save")\n```\n\n WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1781: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n Instructions for updating:\n If using Keras pass *_constraint arguments to layers.\n INFO:tensorflow:Assets written to: ResNet34_save/assets\n \n\nOkay, only one epoch of training didn\'t give us amazing accuracy. Let\'s train the model until we don\'t gain any additional validation loss improvement, by implementing an early stopping callback.\n\n\n```python\n# early stopping callback to prevent overfitting on the training data\nearlystop_cb = keras.callbacks.EarlyStopping(patience=2,\n min_delta=0.01,\n restore_best_weights=True,\n verbose=1)\nhistory2 = model.fit(train_aug_ds, \n steps_per_epoch=15000//BATCH_SIZE,\n validation_data=validate_ds,\n validation_steps=5000//BATCH_SIZE,\n epochs=100,callbacks=[earlystop_cb])\n```\n\n Train for 234 steps, validate for 78 steps\n Epoch 1/100\n 234/234 [==============================] - 262s 1s/step - loss: 0.6049 - accuracy: 0.6701 - val_loss: 0.8103 - val_accuracy: 0.5921\n Epoch 2/100\n 234/234 [==============================] - 248s 1s/step - loss: 0.5449 - accuracy: 0.7241 - val_loss: 0.6400 - val_accuracy: 0.6834\n Epoch 3/100\n 234/234 [==============================] - 249s 1s/step - loss: 0.4874 - accuracy: 0.7682 - val_loss: 0.8053 - val_accuracy: 0.6216\n Epoch 4/100\n 233/234 [============================>.] - ETA: 0s - loss: 0.4247 - accuracy: 0.8076Restoring model weights from the end of the best epoch.\n 234/234 [==============================] - 251s 1s/step - loss: 0.4243 - accuracy: 0.8079 - val_loss: 0.7702 - val_accuracy: 0.7420\n Epoch 00004: early stopping\n \n\n\n```python\nmodel.save("ResNet34_save")\n```\n\nThat\'s pretty great already: we got a validation accuracy of about 75%. Now let\'s see if we can even improve that by using a ResNet-50 pretrained on the ImageNet data set. Of course we\'ll have to remove the top layer of this pretrained model and add the one suitable for our classification task. 
We\'ll first freeze the weights of the lower layers, train the upper layers, and then gradually try to unfreeze some of the lower layers.\n\n\n```python\n# get the pretrained ResNet50\nbase_model = keras.applications.resnet.ResNet50(weights="imagenet",include_top=False)\n```\n\n\n```python\nbase_model = keras.applications.resnet.ResNet50(weights="imagenet",include_top=False)\n# now we\'ll add global average pooling and the output layer on top of the base model\navgpool = keras.layers.GlobalAvgPool2D()(base_model.output)\nflatten = keras.layers.Flatten()(avgpool)\noutput = keras.layers.Dense(2, activation="sigmoid")(flatten)\nmodel2 = keras.models.Model(inputs=base_model.input, outputs=output)\n\n# freeze the layers of the base model\nfor layer in base_model.layers:\n layer.trainable = False\n\n# compile the model with Nadam optimizer\nmodel2.compile(optimizer=optimizer,loss="binary_crossentropy",metrics=["accuracy"])\n```\n\nOkay, let\'s train it.\n\n\n```python\n# early stopping callback to prevent overfitting on the training data\nearlystop_cb = keras.callbacks.EarlyStopping(patience=2,\n min_delta=0.01,\n restore_best_weights=True,\n verbose=1)\n\nhistory3 = model2.fit(train_aug_ds, \n steps_per_epoch=15000//BATCH_SIZE,\n validation_data=validate_ds,\n validation_steps=5000//BATCH_SIZE,\n epochs=100,callbacks=[earlystop_cb])\n```\n\n Train for 234 steps, validate for 78 steps\n Epoch 1/100\n 234/234 [==============================] - 276s 1s/step - loss: 0.1193 - accuracy: 0.9547 - val_loss: 1.4036 - val_accuracy: 0.5098\n Epoch 2/100\n 234/234 [==============================] - 246s 1s/step - loss: 0.0752 - accuracy: 0.9704 - val_loss: 1.6608 - val_accuracy: 0.5090\n Epoch 3/100\n 233/234 [============================>.] - ETA: 0s - loss: 0.0705 - accuracy: 0.9735Restoring model weights from the end of the best epoch.\n 234/234 [==============================] - 245s 1s/step - loss: 0.0707 - accuracy: 0.9735 - val_loss: 1.4818 - val_accuracy: 0.5094\n Epoch 00003: early stopping\n \n\nMhh, the model severely overfitted on the training data. 
Let\'s try to unfreeze the layers and introduce a dropout layer.\n\n\n```python\nbase_model = keras.applications.resnet.ResNet50(weights="imagenet",include_top=False)\n# now we\'ll add global average pooling and the output layer on top of the base model\ndropout1 = keras.layers.Dropout(0.5)(base_model.output)\navgpool = keras.layers.GlobalAvgPool2D()(dropout1)\nflatten = keras.layers.Flatten()(avgpool)\ndropout2 = keras.layers.Dropout(0.5)(flatten)\noutput = keras.layers.Dense(2, activation="sigmoid")(dropout2)\nmodel3 = keras.models.Model(inputs=base_model.input, outputs=output)\n\n# freeze the layers of the base model, then unfreeze the top 31 layers again\nfor layer in base_model.layers:\n layer.trainable = False\nfor layer in base_model.layers[-31:]:\n layer.trainable = True\n\n# compile the model with Nadam optimizer\nmodel3.compile(optimizer=optimizer,loss="binary_crossentropy",metrics=["accuracy"])\n\nearlystop_cb = keras.callbacks.EarlyStopping(patience=2,\n min_delta=0.01,\n restore_best_weights=True,\n verbose=1)\n\nhistory4 = model3.fit(train_aug_ds, \n steps_per_epoch=15000//BATCH_SIZE,\n validation_data=validate_ds,\n validation_steps=5000//BATCH_SIZE,\n epochs=100,callbacks=[earlystop_cb])\n```\n\n Train for 234 steps, validate for 78 steps\n Epoch 1/100\n 234/234 [==============================] - 281s 1s/step - loss: 0.0807 - accuracy: 0.9694 - val_loss: 0.6941 - val_accuracy: 0.5021\n Epoch 2/100\n 234/234 [==============================] - 250s 1s/step - loss: 0.0461 - accuracy: 0.9820 - val_loss: 0.6952 - val_accuracy: 0.4909\n Epoch 3/100\n 233/234 [============================>.] - ETA: 0s - loss: 0.0361 - accuracy: 0.9863Restoring model weights from the end of the best epoch.\n 234/234 [==============================] - 250s 1s/step - loss: 0.0361 - accuracy: 0.9863 - val_loss: 0.6972 - val_accuracy: 0.4895\n Epoch 00003: early stopping\n \n\nOkay, looks like reusing a pretrained model doesn\'t work well here. The overfitting is too severe. But hey, the model we trained from scratch was already doing pretty well, so kind of a success :)\n\n### The End\nOkay, that is enough now. I hope you enjoyed going through five less common data sets and examples for their applications. Maybe you even learned something new about machine learning. Thanks for making it all the way through till here!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Predicting the future usage of shared bikes","A great example for a data science interview take-home challenge",new Date("2019-12-16"),"https://raw.githubusercontent.com/Pascal-Bliem/bike-sharing-prediction/master/bikeimg.jpg","Why need a car if you can go by bike?",["Data Science & AI/ML","Learning"],'### Predicting the usage of a bike sharing service as a technical data science challenge in the interviewing process\n\nWhen I was in the process of looking for a data science job, besides the actual interviews, some companies gave me a take-home challenge to solve and prove my skills and way of thinking. In this post I present one of these assignments and how I solved it. I will make future usage predictions for a bike sharing program and answer a couple of questions on the methodology I use for solving a data science problem, as well as on the technologies used. You can also find the project and the data on my [Github](https://github.com/Pascal-Bliem/bike-sharing-prediction) or in an interactive Jupyter Notebook on [mybinder.org](https://mybinder.org/v2/gh/Pascal-Bliem/bike-sharing-prediction/master?filepath=Pascal-Bliem_HDI-Data-Science-Challenge.ipynb).\n\nI will be using a public data set from [Capital Bikeshare](https://www.capitalbikeshare.com/system-data), which was provided to the [UCI Machine learning repository](http://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset) by Hadi Fanaee-T from the Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto. The slightly modified version of this dataset was provided by HDI. The bike sharing company offers services in Washington D.C., U.S.A., a city with a humid subtropical climate (K\xf6ppen climate classification Cfa) with hot and humid summers and cold winters, which will be good to know later.\n\nIn this notebook I will go through the following 5 questions:\n1. [Usefulness of data for training a predictive model](#q1)\n2. [Discussion of Variables/Features](#q2)\n3. [Training a linear regression model](#q3)\n - [3a. Preprocessing the data](#q3a)\n - [3b. 
Categorical variables](#q3b)\n - [3c. Dimensionality issues](#q3c)\n - [3d. Most effective features](#q3d)\n - [3e. Model performance](#q3e)\n - [3f. Improving the model](#q3f)\n4. [Query the data with SQL and pandas](#q4)\n5. [Merging existing and new data with SQL and pandas](#q5)\n\n[Conclusion](#con)\n\nBefore I approach these questions, I will first import libraries and explore the data a bit. If you would like to jump to the first question right away, just click [here](#q1).\n\n\n```python\n# import libaries \n\n# for numerical calculations\nimport numpy as np \n# for data handling and processing\nimport pandas as pd \n# for plotting/visualizing data\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set_style("darkgrid")\n%matplotlib inline\nimport warnings\n# ignore annoying warnings for the display here \n# (of course I\'ve read them)\nwarnings.filterwarnings("ignore")\n\n```\n\n### Explore the data set\nLet\'s have a look at the data set first. We can find a description of the variables [online](http://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset) together with the data set:\n\n```\n- instant: record index\n- dteday : date\n- season : season (1:winter, 2:spring, 3:summer, 4:fall)\n- yr : year (0: 2011, 1:2012)\n- mnth : month ( 1 to 12)\n- holiday : weather day is holiday or not (extracted from [Web Link])\n- weekday : day of the week\n- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.\n+ weathersit :\n- 1: Clear, Few clouds, Partly cloudy, Partly cloudy\n- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist\n- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds\n- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog\n- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)\n- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)\n- hum: Normalized humidity. The values are divided to 100 (max)\n- windspeed: Normalized wind speed. The values are divided to 67 (max)\n- casual: count of casual users\n- registered: count of registered users\n- cnt: count of total rental bikes including both casual and registered\n```\nAs we can see, the data set contains information on how many bikes were rented at what time, as well as plenty of situational information on weather and type of day (workday, holiday etc.)\n\nI will first load the data and print some summarizing information.\n\n\n```python\n# load the data from the CSV file into a pandas data frame\ndata = pd.read_csv("DataScience_Challange_bike_share.csv")\n\n# have a look at the first 5 rows\ndata.head()\n```\n\n\n\n\n
\n| | instant | dteday | season | yr | mnth | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt |\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n| 0 | 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.344167 | 0.363625 | 0.805833 | 0.160446 | 331 | 654 | 985 |\n| 1 | 2 | 2011-01-02 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.363478 | 0.353739 | 0.696087 | 0.248539 | 131 | 670 | 801 |\n| 2 | 3 | 2011-01-03 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.196364 | 0.189405 | 0.437273 | 0.248309 | 120 | 1229 | 1349 |\n| 3 | 4 | 2011-01-04 | 1 | 0 | 1 | 0 | 2 | 1 | 1 | 0.200000 | 0.212122 | 0.590435 | 0.160296 | 108 | 1454 | 1562 |\n| 4 | 5 | 2011-01-05 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 0.226957 | 0.229270 | 0.436957 | 0.186900 | 82 | 1518 | 1600 |\n
\n\n\n```python\n# get an overview of column names, data types, and number of entries\ndata.info()\n```\n\n \n RangeIndex: 731 entries, 0 to 730\n Data columns (total 16 columns):\n instant 731 non-null int64\n dteday 731 non-null object\n season 731 non-null int64\n yr 731 non-null int64\n mnth 731 non-null int64\n holiday 731 non-null int64\n weekday 731 non-null int64\n workingday 731 non-null int64\n weathersit 731 non-null int64\n temp 731 non-null float64\n atemp 731 non-null float64\n hum 731 non-null float64\n windspeed 731 non-null float64\n casual 731 non-null int64\n registered 731 non-null int64\n cnt 731 non-null int64\n dtypes: float64(4), int64(11), object(1)\n memory usage: 91.5+ KB\n\n\nWe can see that there are 731 instances (rows) in this data set; each row corresponds to one day.\nThe columns show values for the variables representing the previously mentioned information. We can have a look at how the number of rentals develops as a function of the date.\n\n\n```python\n# plot the number of bike rentals as a function of the date\ndata.plot("dteday","cnt",rot=20,figsize=(8,6),fontsize=14,legend=False)\nplt.gca().set_xlabel("Date",fontsize=14)\nplt.gca().set_ylabel("Number of bikes rented",fontsize=14)\nplt.tight_layout()\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_7_0.png)\n\n\nWe can see that the dates range from January 2011 till December 2012, two years. We can also see that the overall trend of bike rentals seems to be increasing and that there are strong local fluctuations as well as a clear seasonal trend (higher number of rentals during the summer months).\n\nLet\'s have a look at how the values of the rest of the variables are distributed by looking at the corresponding histograms.\n\n\n```python\n# plot histograms to display the distribution of the data\ndata.hist(figsize=(10,8))\nplt.tight_layout()\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_9_0.png)\n\n\nWe can see that there are no significant outliers. There are a few interesting things to note, for example, the total count of rentals and registered users are approximately normally distributed, whereas the number of casual users is highly skewed to the lower end of the distribution. Extreme weather situations are rare and temperatures, and probably also humidity, follow bimodal distributions, which is probably related to seasonal weather changes.\n\nNow I will start with answering the questions.\n\n### 1. Usefulness of data for training a predictive model \nThe first question reads:\n"Do you think the included data will be useful in training a predictive model for this purpose? Which variables would you expect to have some effect on the number of shared bikes on a given day? Explain your reasoning."\n\nMost of the provided data seems to have a lot of potential for being useful in training a model. Since riding a bike (and hence, renting one) is an outdoor activity, the count of rentals can be expected to be strongly correlated with weather conditions (`weathersit`, `temp`, `atemp`, `hum`, `windspeed`). People are probably willing to take a bike rather than a car or public transportation if the temperatures are not too cold, it\'s not raining or snowing, and the wind is not too strong. 
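\n\nA quick numerical check of this intuition (a minimal sketch, assuming the `data` frame loaded above) is to look at the correlation of `cnt` with the weather-related columns directly:\n\n\n```python\n# correlation of the daily rental count with the weather-related features\nprint(data[["cnt","atemp","temp","hum","windspeed","weathersit"]].corr()["cnt"])\n```\n\n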
As we can see in the plots below, the number of rentals is strongly positively correlated to the feeling temperature and negatively correlated to the wind speed and the harshness of weather situation.\n\n\n```python\n# initialize subplots\nfig, (ax1, ax2, ax3) = plt.subplots(nrows=1,ncols=3,sharey=True,figsize=(15,5))\n\n# plot bike rentals as a function of feeling temperature\ndata.plot.scatter(x="atemp",y="cnt",ax=ax1,fontsize=14)\nax1.set_ylabel("Number of bikes rented",fontsize=14)\nax1.set_xlabel("Feeling temperature (normalized)",fontsize=14)\nax1.set_title(f"Correlation: {data.corr().cnt.atemp:.2f}",fontsize=14)\n\n# plot bike rentals as a function of wind speed\ndata.plot.scatter(x="windspeed",y="cnt",ax=ax2,fontsize=14)\nax2.set_xlabel("Wind speed (normalized)",fontsize=14)\nax2.set_title(f"Correlation: {data.corr().cnt.windspeed:.2f}",fontsize=14)\n\n# plot bike rentals as a function of weather situation\ndata.plot.scatter(x="weathersit",y="cnt",ax=ax3,fontsize=14)\nax3.set_xlabel("Weather situation",fontsize=14)\n_ = ax3.set_title(f"Correlation: {data.corr().cnt.weathersit:.2f}",fontsize=14)\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_12_0.png)\n\n\nAs mentioned before, the months and seasons, and hence the number of bike rentals, will be highly correlated with the weather. In Washington D.C., warmer weather can be expected during the summer months and colder, harsher weather during the winter months. \n\nThe correlation of bike rentals to variables such as day of the week, working day, or holiday, may be less pronounced. During working days, people will likely use bikes for their commute to work. During weekends and holidays, people may be incentivized to use bikes for recreational purpose. This may lead to similar number of rentals for all sorts of days. We can see in the plots below, that the correlations are indeed very small.\n\n\n```python\n# initialize subplots\nfig, (ax1, ax2, ax3) = plt.subplots(nrows=1,ncols=3,sharey=True,figsize=(15,5))\n\n# plot bike rentals as a function of week day\ndata.plot.scatter(x="weekday",y="cnt",ax=ax1,fontsize=14)\nax1.set_ylabel("Number of bikes rented",fontsize=14)\nax1.set_xlabel("Day of the week",fontsize=14)\nax1.set_title(f"Correlation: {data.corr().cnt.weekday:.2f}",fontsize=14)\n\n# plot bike rentals as a function of working day\ndata.plot.scatter(x="workingday",y="cnt",ax=ax2,fontsize=14)\nax2.set_xlabel("Working day (no/yes)",fontsize=14)\nax2.set_title(f"Correlation: {data.corr().cnt.workingday:.2f}",fontsize=14)\n\n# plot bike rentals as a function of holiday\ndata.plot.scatter(x="holiday",y="cnt",ax=ax3,fontsize=14)\nax3.set_xlabel("Holiday (no/yes)",fontsize=14)\n_ = ax3.set_title(f"Correlation: {data.corr().cnt.holiday:.2f}",fontsize=14)\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_14_0.png)\n\n\n### 2. Discussion of Variables/Features \nThe second question reads:\n"Which feature represents the dependent variable and which ones would you consider independent variables for this problem? Which columns can you discard since they do not provide valuable information? Would you expect high correlation between some of the variables? What should we do with these correlated variables?"\n\nThe dependent variable is the one that has to be predicted (and hence, depends on other variables). In this case it is the number of bike rentals, `cnt`. 
We also have information about how many rentals are from `registered` and `casual` users. These are, of course, dependent variables as well and we can obviously not use them for prediction because that would be a severe case of data leakage (using information that would not actually be available for prediction in a real use case).\n\nThe other variables can be considered as independent variables, as I am not trying to predict them based on other variables. There is, however, a lot of redundant information encoded in them. For example, knowing if it\'s a holiday or weekend also lets us know if it\'s a working day or not. Given the month, we also know the season. The year may be useful though, because we have observed a general upwards trend. The feeling (or [apparent](https://en.wikipedia.org/wiki/Apparent_temperature)) temperature is calculated from the temperature, humidity, and wind speed. Some of the information may not be totally redundant though, since e.g. the wind speed may not only influence the apparent temperature, but may also have a separate influence on bike rentals because biking takes more effort with strong headwinds.\n\nSince the term *feature* is more commonly used in a machine learning context, I will say *feature* instead of *variable* in the following. We may remove redundant or uninformative features (such as the day-based information which shows very little correlation to the number of rentals) to reduce the amount of data and speed up the training and prediction. But besides the speed-up, reducing the number of dimensions may result in better performance, which I will discuss in more detail in the answer to question [3c](#q3c). If we have the time and resources, we can treat the choice of features as a hyper-parameter (a parameter that isn\'t learned during training but chosen beforehand), train several models with different choices, and see which performs the best. Last but not least, since I will be using a linear regression, I must not include features that are perfectly linearly correlated (multicollinearity), which would violate an [assumption](https://en.wikipedia.org/wiki/Linear_regression#Assumptions) of linear regression because then there would be no unique solution for the estimator parameter vector (see the quick correlation check after the table below).\n\nI chose to proceed with the features `yr`, `mnth`, `workingday`, `weathersit`, `windspeed`, and `atemp` to predict the number of rentals, `cnt`, and I will drop the other features for now.\n\n\n```python\n# drop the unneeded features\nselected_data = data.drop(["dteday","instant","season","holiday","weekday","temp","hum","casual","registered"]\n ,axis=1)\nselected_data.head()\n```\n\n\n
\n| | yr | mnth | workingday | weathersit | atemp | windspeed | cnt |\n|---|---|---|---|---|---|---|---|\n| 0 | 0 | 1 | 0 | 2 | 0.363625 | 0.160446 | 985 |\n| 1 | 0 | 1 | 0 | 2 | 0.353739 | 0.248539 | 801 |\n| 2 | 0 | 1 | 1 | 1 | 0.189405 | 0.248309 | 1349 |\n| 3 | 0 | 1 | 1 | 1 | 0.212122 | 0.160296 | 1562 |\n| 4 | 0 | 1 | 1 | 1 | 0.229270 | 0.186900 | 1600 |\n
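\n\nSince perfectly collinear features must be avoided for the linear regression, a quick sanity check (a minimal sketch using the `selected_data` frame defined above) is to look at the pairwise correlations of the remaining independent features and confirm that none of them are close to +/-1:\n\n\n```python\n# pairwise correlations of the selected independent features\nselected_data.drop("cnt",axis=1).corr().round(2)\n```\n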
\n\n\n\nBefore I start training the model, I should split the data into a train, validation, and test set. The reason we need the validation set is to prevent the model from over-fitting on the training data. I need to calculate a score on new data that was not used in training to evaluate the model\'s ability to generalize to unseen data. I will also use this score to optimize the hyper-parameters of the model. Of course, this means there is a risk of slightly over-fitting on the validation set as well. Therefore I will also put aside a test set to evaluate the models final performance on data that it has not seen at any point.\n\nWhen dealing with time series data, I cannot just randomly shuffle instances to split the data set because that would destroy the chronological order (and I don\'t have a time machine). Since there are only 2 years of data, choosing the right split is not trivial. The training data has to reflect the seasonal trend as well as the overall trend, therefore, I allocate the first 70% of data to the train set, 15% to the validation set, and another 15% to the test set. If there was more data over more years available, I would have increased the size of validation and test sets. The features will be denoted as `X`, whereas the target values will be denoted as `y`.\n\n\n```python\n# get index for 70% training data, 15% validation data, 15% test data\ntrain_idx = int(0.7 * len(data))\nvalid_idx = int(0.85 * len(data))\n\n# the training set, features X and targets y\nX_train = selected_data[:train_idx].drop("cnt",axis=1)\ny_train = selected_data[:train_idx].cnt\n\n# the validation set, features X and targets y\nX_valid = selected_data[train_idx:valid_idx].drop("cnt",axis=1)\ny_valid = selected_data[train_idx:valid_idx].cnt\n\n# the test set, features X and targets y\nX_test = selected_data[valid_idx:].drop("cnt",axis=1)\ny_test = selected_data[valid_idx:].cnt\n```\n\n### 3. Training a linear regression model \nThe second question reads:\n"Train a linear regression model on the independent variable using your chosen dependent features. (Please use Python or R)"\n\n\n"a. How do you preprocess your data? Are there missing values? What would you do if you had missing values? When do you think standardization would be necessary?"\n\nLet\'s address the individual points of this question one after the other, starting with missing values. I will have a look if there are any.\n\n\n```python\n# look for missing (NaN encoded) values\nprint(f"Number of missing values:\\n{data.isna().sum()}\\n")\n\n# check the data types\nprint(f"Data types:\\n{data.dtypes}")\n```\n\n Number of missing values:\n instant 0\n dteday 0\n season 0\n yr 0\n mnth 0\n holiday 0\n weekday 0\n workingday 0\n weathersit 0\n temp 0\n atemp 0\n hum 0\n windspeed 0\n casual 0\n registered 0\n cnt 0\n dtype: int64\n \n Data types:\n instant int64\n dteday object\n season int64\n yr int64\n mnth int64\n holiday int64\n weekday int64\n workingday int64\n weathersit int64\n temp float64\n atemp float64\n hum float64\n windspeed float64\n casual int64\n registered int64\n cnt int64\n dtype: object\n\n\nThere are apparently no missing values and none of the columns has a mixed data type, which suggests that there are also no missing values encoded as strings. We have previously seen in the histograms that there are no outliers with suggests that no missing values are encoded as large integer values. 
Furthermore, the data set documentation does not mention any missing values or their encodings.\n\nIf there were missing values, there are a few strategies to handle them. If there are not too many and it is possible to look up the missing data (e.g. the weather), one can manually fill in the missing values. If there are not too many missing values, one can also consider simply dropping the corresponding instances. Another way could be to impute the values. There are a couple of different ways to impute: for categorical values one could impute with the most frequently occurring value, for numerical values one could impute with the mean or median. For sequential data like time series (like in this example) it may make more sense to consider the chronological aspect and impute values by forward filling, backward filling, or filling by rolling window averages. If the time and resources allow for training several models, one can treat the missing value handling as a part of the hyper-parameter optimization. Note that it is important to use the imputation value that was used for the train set also for the validation and test set.\n\nLet\'s talk about standardization, which means transforming the values of a feature so that they (usually) have a mean of zero and a variance of one. This can have several advantages: Many machine learning algorithms optimize their parameters during training by algorithms from the [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent) family. They calculate the partial gradient of each feature and adjust parameters accordingly to descend that gradient and minimize a specified loss function. If all features are on the same scale, these descending steps are of similar size which usually leads to faster convergence, hence, less training time. Because I am not dealing with many features here, I will actually use an implementation of linear regression that optimizes its parameters by solving the [normal equation](https://en.wikipedia.org/wiki/Linear_least_squares) instead of employing gradient descent, but for scenarios with many features, it may be more efficient to use an implementation with a gradient descent type optimizer.\n\nAnother benefit of standardization with respect to linear models, like linear regression, is that it allows us to see the influence of each feature on the predicted value by directly looking at the fitted parameters. In a linear regression, these parameters (often also called coefficients) are scaling factors that determine how much the value of a feature contributes to the predicted values. If the features are on the same scale these coefficients are directly comparable and can tell how much a feature is positively or negatively correlated with the predicted value.\n\nFor these reasons, I will use standardization for the numerical features, which I will implement with `Sci-kit Learn`\'s `StandardScaler`. Note that the scaler will be fitted on the training data and the same learned transformation will be applied to the validation and test data.\n\n\nb. "What about categorical variables? When does label encoding make sense?"\n\nThere are different options for encoding categorical features. The models only take numbers, so if there are any strings as feature values, they would definitely have to be encoded. We do not have any string values in this example. If the categorical features are ordinal, which means they have a natural order, it makes sense to choose an encoding which preserves this order. 
This applies to the year and month in this example. I would say that it also applies to the weather situation, since the weather gets increasingly harsh. \n\nCategorical features that are nominal, which means they have no natural order, can be one-hot encoded. This means that each value the feature can take is represented as its own column which can either take the value 1 if this particular value was present, or 0 if it was not. This applies to the working day feature in this example; either it\'s a working day or not, 1 or 0. Luckily, all features already have a suitable encoding, so there is no need for further preprocessing. If that were not the case, the `Sci-kit Learn` library offers many convenient preprocessing methods such as the `LabelEncoder` for ordinal features and the `OneHotEncoder` for nominal features.\n\nIf a nominal categorical feature can take many possible values, one-hot encoding can add a lot of additional dimensions which may hurt the model\'s performance. This could, for example, be the case in text processing when these values are words in a vocabulary. In such cases, values may be represented by so-called embeddings, which are vector representations. Each value is represented as a dense vector and the location of the vector in the embedding space can be learned unsupervised e.g. by algorithms such as [word2vec](https://en.wikipedia.org/wiki/Word2vec) or [GloVe](https://nlp.stanford.edu/projects/glove/) or in an embedding layer as a part of a supervised model.\n\n\nc. "How many dimensions do you have after preprocessing? Is it too many? Why? How would you reduce the number of dimensions when you look at your training results?"\n\nTo answer this question I will now finally do the preprocessing, meaning the standardization of the numerical features.\n\n\n```python\n# import methods for preprocessing\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.preprocessing import StandardScaler\n\n# set up a transformer that will standardize the numerical features\n# and will do nothing with the already encoded categorical features\npreprocess = ColumnTransformer(transformers=\n [("categorical", "passthrough", ["yr","mnth","workingday","weathersit"]),\n ("numerical", StandardScaler(),["atemp","windspeed"])])\n\n# transform the feature sets\nX_train_pp = pd.DataFrame(preprocess.fit_transform(X_train),\n columns=X_train.columns)\n\nX_valid_pp = pd.DataFrame(preprocess.transform(X_valid),\n columns=X_valid.columns)\n\nX_test_pp = pd.DataFrame(preprocess.transform(X_test),\n columns=X_test.columns)\n\n# let\'s have a look at the first few lines\n\nprint(f"Apparent temperature - mean: {X_train_pp.atemp.mean():.2f}, variance: {X_train_pp.atemp.var():.2f}")\nprint(f"Wind speed - mean: {X_train_pp.windspeed.mean():.2f}, variance: {X_train_pp.windspeed.var():.2f}")\nX_train_pp.head()\n```\n\n Apparent temperature - mean: -0.00, variance: 1.00\n Wind speed - mean: 0.00, variance: 1.00\n\n\n
|   | yr | mnth | workingday | weathersit | atemp | windspeed |
|---|-----|------|------------|------------|-----------|-----------|
| 0 | 0.0 | 1.0 | 0.0 | 2.0 | -0.534007 | -0.457752 |
| 1 | 0.0 | 1.0 | 0.0 | 2.0 | -0.595785 | 0.666032 |
| 2 | 0.0 | 1.0 | 1.0 | 1.0 | -1.622711 | 0.663098 |
| 3 | 0.0 | 1.0 | 1.0 | 1.0 | -1.480752 | -0.459665 |
| 4 | 0.0 | 1.0 | 1.0 | 1.0 | -1.373594 | -0.120284 |
\n\n\n\nWe can see that the numeric features have a mean of 0 and a variance of 1 after the standardization. There are still six features left, meaning the feature space has six dimensions. This is certainly not a lot for a modern machine learning problem. Often hundreds or thousands of features may be used for a regression problem. I will train a simple model without any optimization first to get an idea if all of these features are actually informative.\n\n\n```python\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.metrics import mean_absolute_error\n\n# train a linear regression model\nlr = LinearRegression().fit(X_train_pp,y_train)\n# make predictions\ny_pred = lr.predict(X_train_pp)\n# calculate error between predictions and true values\nmae = mean_absolute_error(y_train, y_pred)\n# range (max - min) of bike rentals in the data\nmax_num_rent = data.cnt.max()-data.cnt.min()\n\nprint(f"The training mean absolute error is {mae:.2f}, this corresponds to \\\n{mae/max_num_rent*100:.2f}% of the maximum number of rentals in the training data.\\n") \n\nprint(f"Model coefficients:\\n{list(zip(X_train.columns,lr.coef_.tolist()))}")\n```\n\n The training mean absolute error is 604.80, this corresponds to 6.96% of the maximum number of rentals in the training data.\n \n Model coefficients:\n [(\'yr\', 2097.8780274091005), (\'mnth\', 97.97115080615995), (\'workingday\', 76.19387660471806), (\'weathersit\', -663.5996158641194), (\'atemp\', 982.3073091560635), (\'windspeed\', -156.92405815868514)]\n\n\nThe model has a training mean absolute error of 604.80, which is not great but also not absolutely horrible. I\'ll try to improve it later; for now, let\'s look at the parameters. They\'re not directly comparable to each other because the categorical features are not standardized, but we can see that none of them is very close to zero, which means that none of the features is totally uninformative. The model is probably not suffering from too high dimensionality here.\n\nBut what if the model had a very high dimensionality? In machine learning there is something called the curse of dimensionality. The length of a vector with $i$ dimensions is calculated as $\\sqrt{\\sum_i x_i^2}$, which means that distances between instances increase with increasing dimensionality. The space of training examples is, therefore, more sparsely populated, and new data points may be further away from training points, which may have a negative impact on the model performance. That is why dimensionality reduction can be a good idea (even though it doesn\'t seem necessary in this example).\n\nThere are a couple of different approaches, one being elimination of features. One can either start with one or with all features and then add or subtract features until the model reaches its optimal performance. The most informative features can be chosen, for example, by using univariate statistics. One can see if there is a statistically significant relationship between the features and the target value. The features that have a strong relationship (a low p-value) will be selected. These methods only consider each feature by itself, not the relationships between them. 
Let\'s look at an example where I want to select the 3 best out of 6 features:\n\n\n```python\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_selection import f_regression\nfrom operator import itemgetter\n\n# perform univariate feature selection\nunivar_select = SelectKBest(score_func=f_regression, k=3).fit(X_train_pp,y_train)\n\n# get the p-values for each feature\npvalues = sorted(zip(X_train.columns,univar_select.pvalues_.tolist()),key=itemgetter(1))\n\nprint(f"The three most informative features and their calculated p-values:\\n {pvalues[:3]}")\n```\n\n The three most informative features and their calculated p-values:\n [(\'atemp\', 3.3347350381875324e-58), (\'yr\', 9.239451444912234e-20), (\'weathersit\', 8.276958451168134e-12)]\n\n\nAnother approach is to use a machine learning model itself to assess the feature importance and select features accordingly. As we have seen before, in linear models, the absolute values of the coefficients can be used to assess feature importance, if the features were standardized. Decision tree-based models, such as a [Random Forest](https://en.wikipedia.org/wiki/Random_forest), can assess feature importance by seeing how many splits in the trees are caused by each feature. Let\'s have a look at an example using a Random Forest regression:\n\n\n```python\nfrom sklearn.feature_selection import SelectFromModel\nfrom sklearn.ensemble import RandomForestRegressor\n\n# train a random forest regressor \nrfr = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train,y_train)\n# get the feature importances\nfeat_imp = sorted(zip(X_train.columns,rfr.feature_importances_.tolist()),key=itemgetter(1),reverse=True)\n\nprint(f"The three most informative features and their relative importance:\\n {feat_imp[:3]}")\n\n# this is how we could select the features\nX_train_pp_slct = SelectFromModel(rfr).fit_transform(X_train_pp,y_train)\n```\n\n The three most informative features and their relative importance:\n [(\'atemp\', 0.5614248133121407), (\'yr\', 0.20407565998099822), (\'mnth\', 0.08630387880812807)]\n\n\nNote that these two methods do not agree on the third most important feature. There is another approach to reducing dimensionality that applies unsupervised machine learning, namely principal component analysis ([PCA](https://en.wikipedia.org/wiki/Principal_component_analysis)). In PCA, the features are projected onto the components that preserve the highest amount of variance, hence information, in the data. That means that features that have a high correlation (such as high wind speeds and cold apparent temperatures) may be projected onto the same component to a large extent. One can either choose the desired number of principal components or a certain fraction of variance that should be preserved. The resulting components are often not interpretable as something real anymore, but sometimes they capture a general idea, like weather-related things here. Let\'s have a look at an example:\n\n\n```python\nfrom sklearn.decomposition import PCA\n\n# perform PCA onto 3 PCs\npca1 = PCA(n_components=3).fit(X_train)\nprint(f"3 PCs preserve {pca1.explained_variance_ratio_.sum()*100:.2F}% of the variance.")\n\n# perform PCA that preserves 95% of the variance\npca2 = PCA(n_components=0.95).fit(X_train)\nprint(f"{len(pca2.components_)} PCs are needed to preserve at least 95% of the variance.")\n```\n\n 3 PCs preserve 98.52% of the variance.\n 2 PCs are needed to preserve at least 95% of the variance.\n\n\n\nd. 
"Which feature(s) do you think have the most effect on the dependent variable from your training results?"\n\nI have practically answered this question already in the previous question. We have seen that, based on both univariate statistics as well as feature importance derived from a Random Forest model, the apparent temperature, `atemp`, seems to be the most informative feature, followed by the year, `yr`. This agrees well with the trends we saw in the line graph earlier: there is a strong seasonal trend (more rentals during the warm seasons) and a general upwards trend (higher in the second year).\n\nTo get an idea of how much the model is actually benefiting from these features, let\'s have a look at the mean absolute training error without them:\n\n\n```python\n# train a model with each of the features missing once\nmaes = []\nfor feat in X_train.columns:\n # train a linear regression model without feat\n lr = LinearRegression().fit(X_train_pp.drop(feat,axis=1),y_train)\n # make predictions\n y_pred = lr.predict(X_train_pp.drop(feat,axis=1))\n # calculate error between predictions and true values\n mae = mean_absolute_error(y_train, y_pred)\n # append result to list\n maes.append(mae)\n \n# sort errors \nmaes = sorted(zip(X_train.columns,maes),key=itemgetter(1),reverse=True)\n\n# print results\nprint(f"Training mean absolute error without listed features:")\ndisplay(maes)\n```\n\n Training mean absolute error without listed features:\n\n\n\n [(\'atemp\', 981.1838792292479),\n (\'yr\', 905.057310410891),\n (\'weathersit\', 663.238778333911),\n (\'mnth\', 657.9772239174282),\n (\'windspeed\', 614.1430042853677),\n (\'workingday\', 605.3359655642513)]\n\n\nAs we can see, the models do indeed have the largest errors if the features `atemp` and `yr` are missing.\n\n\ne. "How do you assess the success of your obtained model? Report the Adjusted R-squared, F-statistic and the MS (mean squared error) error on your training and test data."\n\nOf course, we would like to have an idea of how a model is performing. And it is great to have a single number to judge the performance so that it is easy to compare different models. This is particularly important when optimizing hyper-parameters, which I will do in the next question. I have already used the mean absolute error, which is very easy to interpret. It is just that: the average of the absolute errors over all predicted data points. Another choice of error is the mean squared error (MSE). Because it squares the errors of all predicted data points, larger errors are taken into account (are penalized) much more than smaller errors. Let\'s calculate the MSE for all data sets:\n\n\n```python\nfrom sklearn.metrics import mean_squared_error\n\n# train a linear regression model\nlr = LinearRegression().fit(X_train_pp,y_train)\n\n# make predictions\ny_train_pred = lr.predict(X_train_pp)\ny_valid_pred = lr.predict(X_valid_pp)\ny_test_pred = lr.predict(X_test_pp)\n\n# calculate errors\nmse_train = mean_squared_error(y_train,y_train_pred)\nmse_valid = mean_squared_error(y_valid,y_valid_pred)\nmse_test = mean_squared_error(y_test,y_test_pred)\n\nprint(f"Training MSE:\\t{mse_train:.2f}\\nValidation MSE:\\t{mse_valid:.2f}\\nTest MSE:\\t{mse_test:.2f}\\n")\n```\n\n Training MSE:\t625278.59\n Validation MSE:\t1012926.19\n Test MSE:\t2024672.44\n \n\n\nWe can see that the validation and test MSEs are much higher than the training MSE, which may be an indicator that the model is over-fitting on the training data and doesn\'t generalize well to new, unseen data. 
I will try to approach this problem in the next question. Note that the unit of the MSE is $(number-of-bike-rentals)^2$ now and that one would have to take the square root of it to bring it back to an interpretable dimension.\n\nAnother approach to assess the quality of a model is to have a look at how much of the variance in the target data is explained by the predictions of the model. This is what the [coefficient of determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) is for. It is calculated as $R^2 = 1 - \\frac{SS_{err}}{SS_{tot}}$, where $SS_{err}$ and $SS_{tot}$ are the sum of squares of the errors (or residuals) and the total sum of squares, respectively. The sums of squares are proportional to the variances. The closer $R^2$ is to 1, the better the goodness of the model\'s fit. This works well for a univariate regression, but the more independent features are added, the more $R^2$ tends to spuriously increase without actually providing a better fit. That\'s why, for multivariate regression, there is an adjusted $R^2_{adj} = 1-(1-R^2)\\frac{n-1}{n-p-1}$, where $n$ is the number of data points and $p$ is the number of features (plus the intercept term). Let\'s calculate this $R^2_{adj}$ for all data sets:\n\n\n```python\nfrom sklearn.metrics import r2_score\n\n# train a linear regression model\nlr = LinearRegression().fit(X_train_pp,y_train)\n\n# make predictions\ny_train_pred = lr.predict(X_train_pp)\ny_valid_pred = lr.predict(X_valid_pp)\ny_test_pred = lr.predict(X_test_pp)\n\n# calculate adjusted R2 scores\ndef adjR2(X, y_true, y_pred):\n n = len(X) \n p = len(X.columns) + 1 # +1 for the intercept term \n return 1-(1-r2_score(y_true,y_pred))*(n-1)/(n-p-1)\n\nadjR2_train = adjR2(X_train_pp, y_train, y_train_pred)\nadjR2_valid = adjR2(X_valid_pp, y_valid, y_valid_pred)\nadjR2_test = adjR2(X_test_pp, y_test, y_test_pred)\n\n# print results\nprint(f"Training adj. R2:\\t{adjR2_train:.2f}\\nValidation adj. R2:\\t\\\n{adjR2_valid:.2f}\\nTest adj. R2:\\t{adjR2_test:.2f}\\n")\n```\n\n Training adj. R2:\t0.75\n Validation adj. R2:\t-0.48\n Test adj. R2:\t0.46\n \n\n\nWe can see again that the model performs okayish on the training set but lousy on the validation and test data. I mentioned before that it is a good idea to have one scoring metric to compare models and see which performs better. But how do we know if the difference in score between two models is actually meaningful in the sense that it is statistically significant? For example, how do we know if adding another feature would significantly improve the model? The most straightforward comparison would be to the simplest linear regression model, one that only fits the intercept term and all the feature parameters are equal to zero.\n\nWe can perform a statistical [F-test](https://en.wikipedia.org/wiki/F-test) on two models and calculate the resulting F-statistic, $F = \\frac{SS_{err1}-SS_{err2}}{p_2-p_1} / \\frac{SS_{err2}}{n - p_2}$, where $n$ is the number of data points and $p$ is the number of features (plus the intercept term). If this $F$ is larger than a certain critical value (which we can choose), we can consider the difference statistically significant. Since the F-statistic will be distributed following a F-distribution, we can calculate a corresponding p-value. 
Let\'s calculate it:\n\n\n```python\nfrom sklearn.linear_model import Lasso\n\n# I will use a linear regression with very strong l1 (Lasso) regularization\n# to force all coefficients except the intercept to be equal to 0\nlr_intercept_only = Lasso(100000).fit(X_train_pp,y_train)\n\n# make predictions with the intercept-only model\ny_train_pred_io = lr_intercept_only.predict(X_train_pp)\ny_valid_pred_io = lr_intercept_only.predict(X_valid_pp)\ny_test_pred_io = lr_intercept_only.predict(X_test_pp)\n\n# print coefficients\nprint(f"The intercept-only model has\\nfeature coefficients\\n{lr_intercept_only.coef_}\\n\\\nintercept coefficient\\n{lr_intercept_only.intercept_:.2f}")\n```\n\n The intercept-only model has\n feature coefficients\n [ 0. 0. 0. -0. 0. -0.]\n intercept coefficient\n 3794.21\n\n\n\n```python\nimport scipy\n\n# train a linear regression model\nlr = LinearRegression().fit(X_train_pp,y_train)\n\n# make predictions\ny_train_pred = lr.predict(X_train_pp)\ny_valid_pred = lr.predict(X_valid_pp)\ny_test_pred = lr.predict(X_test_pp)\n\n# calculate the F-statistic and p-value for the two models\ndef f_test(X, y_pred1, y_pred2, y_true):\n # calculate the residual sum of squares\n rss1 = ((y_true - y_pred1)**2).sum()\n rss2 = ((y_true - y_pred2)**2).sum()\n # number of features + intercept\n p1 = 1\n p2 = len(X.columns) + 1\n # number of instaces\n n = len(X)\n # calculate F-statistic\n F = ((rss1-rss2)/(p2-p1)) / (rss2/(n-p2))\n # calculate corresponding upper-tail p-value\n pvalue = 1 - scipy.stats.f.cdf(F, n-p1, n-p2)\n return (F, pvalue)\n \nf_stat_train, pvalue_train = f_test(X_train_pp, y_train_pred_io, y_train_pred, y_train)\nf_stat_valid, pvalue_valid = f_test(X_valid_pp, y_valid_pred_io, y_valid_pred, y_valid)\nf_stat_test, pvalue_test = f_test(X_test_pp, y_test_pred_io, y_test_pred, y_test)\n\n# print the results\nprint(f"Training F-statistic: {f_stat_train:.2f} and p-value: {pvalue_train}\\n\\\nValidation F-statistic: {f_stat_valid:.2f} and p-value: {pvalue_valid}\\n\\\nTest F-statistic: {f_stat_test:.2f} and p-value: {pvalue_test}")\n```\n\n Training F-statistic: 253.39 and p-value: 1.1102230246251565e-16\n Validation F-statistic: 143.78 and p-value: 1.1102230246251565e-16\n Test F-statistic: 42.97 and p-value: 1.1102230246251565e-16\n\n\nIf we would have set the significance level, $\\alpha$, to 0.05 (a common choice), we would conclude that the model including all the features is clearly statistically significantly better on all data sets than the model that only fits the intercept.\n\n\nf. "Plot your predictions against the real values (the x-axis can be the date or the index). What do you think about the results? Is the fit good? 
How would you recommend finding a better model?"\n\nLet\'s have a look at how well the model\'s predictions match the real data.\n\n\n```python\n# remember, this is the model:\n# train a linear regression model\nlr = LinearRegression().fit(X_train_pp,y_train)\n\n# make predictions\ny_train_pred = lr.predict(X_train_pp)\ny_valid_pred = lr.predict(X_valid_pp)\ny_test_pred = lr.predict(X_test_pp)\n\n# set up figure\nfig, ax = plt.subplots(nrows=1,ncols=1,figsize=(8,6))\n# plot original and predicted data\ndata.plot("dteday", "cnt",rot=20,fontsize=14,ax=ax,label="Original data",lw=2,alpha=0.5)\nx_data = ax.get_children()[0].get_xdata()\nax.plot(x_data[:train_idx],y_train_pred,"--",label="Training data fit")\nax.plot(x_data[train_idx:valid_idx],y_valid_pred,"--",label="Validation data fit")\nax.plot(x_data[valid_idx:],y_test_pred,"--",label="Testing data fit")\n# add labels\nax.set_xlabel("Date",fontsize=14)\nax.set_ylabel("Number of bikes rented",fontsize=14)\n_ = ax.legend(fontsize=14)\n\n# print some scores again\nprint(f"Training\\tR2:\\t{r2_score(y_train, y_train_pred):.2f}\\tMAE:\\t{mean_absolute_error(y_train, y_train_pred)}\\n\\\nValidation\\tR2:\\t{r2_score(y_valid, y_valid_pred):.2f}\\tMAE:\\t{mean_absolute_error(y_valid, y_valid_pred)}\\n\\\nTesting\\t\\tR2:\\t{r2_score(y_test, y_test_pred):.2f}\\tMAE:\\t{mean_absolute_error(y_test, y_test_pred)}")\n```\n\n Training\tR2:\t0.75\tMAE:\t604.8020129718502\n Validation\tR2:\t-0.39\tMAE:\t779.5061040738124\n Testing\t\tR2:\t0.49\tMAE:\t1096.6528270267904\n\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_45_1.png)\n\n\nThe model already captures the overall and seasonal trends quite well. However, it still seems to have problems following the strong local fluctuations, for example around July-August 2011 or around August-October 2012.\n\nIf we want to think about improving a model, we need a baseline to compare the new versions to. The simple linear regression above has already provided such a baseline. Another good idea is to make a simple common sense prediction without any fancy modeling. For example, for time series data, the values often show strong local correlations. The number of bikes rented today may be pretty similar to yesterday. Therefore, let\'s see what baseline we get if we just predict the same number as 24 hours earlier.\n\n\n```python\n# predict the same number of rentals as the previous day\ny_true = data.cnt[1:]\ny_pred = data.cnt.shift(1)[1:]\n\n# calculate R2 and MAE\nr2 = r2_score(y_true, y_pred)\nmae = mean_absolute_error(y_true, y_pred)\nprint(f"R2:\\t{r2:.2f}\\nMAE:\\t{mae:.2f}")\n```\n\n R2:\t0.70\n MAE:\t729.88\n\n\nThe simple linear regression model already performs better than this common sense baseline (on the training data; for the validation and test data this is not directly comparable). To further improve the model, I will try two approaches: First I will try to extend the simple linear regression model, then I will try a completely different approach with time series auto-regression.\n\nTo extend the linear regression I will consider adding polynomial features. Linear regression (as the name already suggests) just linearly combines features. If we want to express non-linear dependencies or interactions between features, we can just add these terms as additional features to the linear regression model. Furthermore, I will add regularization. Regularization tries to hinder over-fitting on the training data by restricting the values of the fitted coefficients. 
This can be done by penalizing the absolute values of the coefficients (l1-type, also called lasso), their squared values (l2-type, also called ridge), or a combination of both (elastic net type).\n\n\n```python\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import PolynomialFeatures\n\n# write a new preprocessor that creates polynomial features for the numerical features\n# and then standardizes them, categorical features just get passed through\npreprocess = ColumnTransformer(transformers=[\n ("cat", "passthrough", ["yr","mnth","workingday","weathersit"]),\n ("num", Pipeline(steps=[\n ("poly", PolynomialFeatures(degree=1)),\n ("std", StandardScaler())\n ]),["atemp","windspeed"])])\n\n# combine the preprocessing with the elastic-net-regularized linear regression in a pipeline\nestimator = Pipeline(steps=[\n ("prep", preprocess),\n ("enlr", ElasticNet(alpha=0.1,l1_ratio=0.5,max_iter=10000000))\n])\n```\n\nThe hyper-parameters to be tuned here are the degree of the polynomial features, the regularization coefficient, alpha, and the ratio between l1 and l2 regularization. \n\nFor hyper-parameter optimization, one usually performs a search over a given parameter space, looking for the best cross-validation score. A k-fold cross-validation means that a model is trained with the same settings k times, but each time a different part of the training data is put aside as a validation set, and the average validation score is calculated. Since we work with time series data, these validation sets cannot just be chosen randomly (as is usually done). For time series data, one can perform so-called sliding window or forward-chaining cross-validation. For this very small data set, however, there is relatively little training data left to reasonably train a model if these techniques are used. Therefore, I split off a validation set in the beginning and will perform only one validation step on this set. This is arguably not the best way to approach the problem, but I\'ll stick to it here for simplicity.\n\nTo perform the search over the parameter space, one can, e.g., either specify a grid with predefined values or define a range and pick values from it randomly for a certain number of iterations. Neither of these methods is particularly clever because they do not consider whether going in a particular direction of the search space may result in a better score. To build a surrogate probability model that tries to move in the most promising direction of the search space for a given number of iterations, one can use a Bayesian search. This is what I will do here. 
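\n\nAs a quick aside, here is a minimal sketch of what such forward-chaining cross-validation could look like with `scikit-learn`\'s `TimeSeriesSplit` (I am not using it here, for the reasons given above; the call simply reuses the `estimator`, `X_train`, and `y_train` objects from above for illustration):\n\n```python\nfrom sklearn.model_selection import TimeSeriesSplit, cross_val_score\n\n# forward-chaining splits: each fold trains on an expanding window of past\n# observations and validates on the block that directly follows it in time\ntscv = TimeSeriesSplit(n_splits=5)\n\n# hypothetical usage with the pipeline estimator defined above\nscores = cross_val_score(estimator, X_train, y_train,\n                         scoring="neg_mean_absolute_error", cv=tscv)\nprint("MAE per forward-chained fold:", -scores)\n```\n\nWith a larger data set, such a splitter could also be passed as the `cv` argument of the hyper-parameter search instead of a single predefined validation split.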
\n\n\n```python\nfrom sklearn.model_selection import PredefinedSplit\nfrom skopt import BayesSearchCV\n\n# we will have to tell the optimizer that we \n# already have a predefined validation set\nvalid_fold_idx = -1 * np.ones(shape=len(selected_data))\nvalid_fold_idx[train_idx:valid_idx] = 0\nps = PredefinedSplit(test_fold=valid_fold_idx)\n\n\n# this will be the search space for the hyper-parameters\nsearch_spaces = {"prep__num__poly__degree": (1, 8), \n "enlr__alpha": (1e-6, 1e+1, "log-uniform"),\n "enlr__l1_ratio": (0.0, 1.0,"uniform")}\n\n# Set up the Bayesian search with the pipeline estimator that\n# already contains the preprocessing and use the predefined\n# search spaces and validation set split, use 100 iterations.\nopt = BayesSearchCV(\n estimator,\n search_spaces=search_spaces,\n scoring="neg_mean_absolute_error",\n n_iter=100,\n cv=ps\n)\n\nopt.fit(selected_data.drop("cnt",axis=1), selected_data.cnt)\n\n\n# print the best results\nprint(f"Best validation MAE: {-1*opt.best_score_}\\nBest parameters:\\n{opt.best_params_}")\n```\n\n Best validation MAE: 557.5117421560535\n Best parameters:\n {\'enlr__alpha\': 0.0004943602758532629, \'enlr__l1_ratio\': 0.01673465670712895, \'prep__num__poly__degree\': 5}\n\n\nThis already looks very promising. Let\'s compare the current to the previous results.\n\n\n```python\n# make predictions\ny_train_pred = opt.predict(X_train)\ny_valid_pred = opt.predict(X_valid)\ny_test_pred = opt.predict(X_test)\n\n# set up figure\nfig, ax = plt.subplots(nrows=1,ncols=1,figsize=(8,6))\n# plot original and predicted data\ndata.plot("dteday", "cnt",rot=20,fontsize=14,ax=ax,label="Original data",lw=2,alpha=0.5)\nx_data = ax.get_children()[0].get_xdata()\nax.plot(x_data[:train_idx],y_train_pred,"--",label="Training data fit")\nax.plot(x_data[train_idx:valid_idx],y_valid_pred,"--",label="Validation data fit")\nax.plot(x_data[valid_idx:],y_test_pred,"--",label="Testing data fit")\n# add labels\nax.set_xlabel("Date",fontsize=14)\nax.set_ylabel("Number of bikes rented",fontsize=14)\n_ = ax.legend(fontsize=14)\n\n# calculate R2 and MAE and print results\nprint(f"Training\\tR2:\\t{r2_score(y_train, y_train_pred):.2f}\\tMAE:\\t{mean_absolute_error(y_train, y_train_pred)}\\n\\\nValidation\\tR2:\\t{r2_score(y_valid, y_valid_pred):.2f}\\tMAE:\\t{mean_absolute_error(y_valid, y_valid_pred)}\\n\\\nTesting\\t\\tR2:\\t{r2_score(y_test, y_test_pred):.2f}\\tMAE:\\t{mean_absolute_error(y_test, y_test_pred)}")\n```\n\n Training\tR2:\t0.80\tMAE:\t541.7587443724088\n Validation\tR2:\t0.31\tMAE:\t542.4176582870627\n Testing\t\tR2:\t0.65\tMAE:\t923.1428791902093\n\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_53_1.png)\n\n\nAfter hyper-parameter optimization, this improved model clearly outperforms both the common sense baseline as well as the previous simple linear regression model. It is following the local fluctuations much better, and the training and validation scores are much better. The test score is better as well, but the error is still much higher than the train or validation scores. That is a clear sign that the optimization has over-fitted the validation set to some degree. That\'s why cross-validation would be a good idea if more data were available.\n\nNow I will try the second approach I mentioned, a time series auto-regression. But first I have to say a few words on stationary time series. Most time series data (and this one here as well) show some overall trend and some form of seasonality. 
That means it is not stationary, because statistical properties such as mean and variance do not remain constant over time, and the autocovariance likely depends on time as well. For many time series forecasting models, however, we do need a stationary time series to fulfill their assumptions.\n\nWe can test if a time series is stationary with the Dickey-Fuller test. If the resulting p-value is less than a chosen significance level (say 0.05), I will reject the null hypothesis of non-stationarity and say that the series is stationary.\n\n\n```python\nfrom statsmodels.tsa.stattools import adfuller\n\n# first convert the date column to datetime data format\ndata.dteday = pd.to_datetime(data.dteday)\n\n# Dickey-Fuller test\nts = data.set_index("dteday").cnt.copy()\ndftest = adfuller(ts, autolag=\'AIC\')\n\n# print result\nprint(f"Test statistic: {dftest[0]}\\np-value: {dftest[1]}")\n```\n\n Test statistic: -1.8774481444374287\n p-value: 0.3427434403392199\n\n\nI will choose 0.05 as a significance level and conclude that, since the p-value of 0.34 is much higher than 0.05, the time series is not stationary (which we kind of knew already from looking at the plots). To make the time series stationary, I will have to remove the overall and seasonal trend. There are, again, a couple of approaches using differencing, aggregations, rolling averages, or polynomial fits, or special seasonal decompositions. I\'ll use the latter.\n\n\n```python\nfrom statsmodels.tsa.seasonal import seasonal_decompose\n\n# perform a seasonal decomposition\ndecomposition = seasonal_decompose(ts)\nresidual = decomposition.resid\n\n# the residuals are the decomposed time series \nts_decomposed = residual.dropna()\n\n# plot the original data, trend, seasonality, and residuals\n_ = decomposition.plot().gca()\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_58_0.png)\n\n\nNow I will test again whether the time series is stationary.\n\n\n```python\n# perform another Dickey-Fuller test\ndftest = adfuller(ts_decomposed, autolag=\'AIC\')\nprint(f"Test statistic: {dftest[0]}\\np-value: {dftest[1]}")\n```\n\n Test statistic: -11.952547293815142\n p-value: 4.283319262618949e-22\n\n\nThe p-value is far below 0.05 and I conclude that the time series is stationary now. Since in a time series there is usually a strong dependence among values, one needs to use a statistical model like ARIMA (Auto-Regressive Integrated Moving Average) to predict the data. First I need to find the correct number of auto-regressive (AR) parameters, p, and moving average (MA) parameters, q. This can be done by looking at plots of the auto-correlation function (ACF) and the partial auto-correlation function (PACF) and seeing where these functions cross the upper boundaries of (e.g. 
0.95) confidence intervals around zero.\n\n\n```python\nfrom statsmodels.tsa.stattools import acf, pacf\n\n# calculate ACF and PACF\nlag_acf = acf(ts_decomposed, nlags=20)\nlag_pacf = pacf(ts_decomposed, nlags=20, method=\'ols\')\n\n# set up subplot\nfig, (ax1, ax2) = plt.subplots(nrows=1,ncols=2,figsize=(10,4))\n\n# plot ACF + confidence interval around 0\nax1.plot(lag_acf)\nax1.set_xticks(range(len(lag_acf)))\nax1.axhline(y=0,linestyle=\'--\',c="k")\nax1.axhline(y=-1.96/np.sqrt(len(ts_decomposed)),linestyle=\'--\',c="k")\nax1.axhline(y=1.96/np.sqrt(len(ts_decomposed)),linestyle=\'--\',c="k")\nax1.set_title("ACF")\n\n# plot PACF + confidence interval around 0\nax2.plot(lag_pacf)\nax2.set_xticks(range(len(lag_pacf)))\nax2.axhline(y=0,linestyle=\'--\',c="k")\nax2.axhline(y=-1.96/np.sqrt(len(ts_decomposed)),linestyle=\'--\',c="k")\nax2.axhline(y=1.96/np.sqrt(len(ts_decomposed)),linestyle=\'--\',c="k")\n_ = ax2.set_title("PACF")\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_62_0.png)\n\n\nBoth functions cross the confidence interval at 1, so I will choose p=1 and q=1. I will now fit an ARIMA model to the time series.\n\n\n```python\nfrom statsmodels.tsa.arima_model import ARIMA\n\n# order = (p,d,q) means number or AR parameters, differences, and MA parameters to use\n# only fit on the training data\nmodel_ARIMA = ARIMA(ts.iloc[:train_idx], order=(1, 1, 1), freq="D")\n# -1 means: don\'t print any convergence info\nresults_ARIMA = model_ARIMA.fit(disp=-1)\n\n# make predictions\ny_train_stationary = decomposition.resid[:train_idx]\ny_train_pred_stationary = results_ARIMA.predict(1, train_idx, ts)\ny_train_pred = (results_ARIMA.predict(1, train_idx, ts)\n + decomposition.seasonal\n + decomposition.trend).dropna()\n\n# let\'s put validation and test set together now\ndecomp_validtest = seasonal_decompose(ts[train_idx:])\ny_validtest_stationary = decomp_validtest.resid\ny_validtest_pred_stationary = results_ARIMA.predict(train_idx, len(ts)-1, ts)\ny_validtest_pred = (results_ARIMA.predict(train_idx, len(ts), ts)\n + decomp_validtest.seasonal\n + decomp_validtest.trend).dropna()\n\n\n# set up figure\nfig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))\n# plot original and predicted data\ndata.plot("dteday", "cnt", rot=20, fontsize=14, ax=ax,\n label="Original data", lw=2, alpha=0.5)\nx_data = ax.get_children()[0].get_xdata()[:-2]\nax.plot(x_data[2:train_idx], y_train_pred, "--", label="Training data fit")\nax.plot(x_data[train_idx+2:-2], y_validtest_pred,\n "--", label="Validation + test data fit")\n\n# add labels\nax.set_xlabel("Date", fontsize=14)\nax.set_ylabel("Number of bikes rented", fontsize=14)\n_ = ax.legend(fontsize=14)\n\n# calculate R2 and print results\nprint(f"On the stationary time series:\\n\\\nTraining\\tR2:\\t{r2_score(y_train_stationary[3:], y_train_pred_stationary[2:-1]):.2f}\\tMAE:\\t\\\n{mean_absolute_error(y_train_stationary[3:], y_train_pred_stationary[2:-1])}\\n\\\nValid+test\\tR2:\\t{r2_score(y_validtest_stationary[3:-3], y_validtest_pred_stationary[3:-3]):.2f}\\t\\\nMAE:\\t{mean_absolute_error(y_validtest_stationary[3:-3], y_validtest_pred_stationary[3:-3])}\\n")\n\n\n# calculate R2 and MAE after adding trend as seanonality back in and print results\nprint(f"After adding trend and seasonality back in:\\n\\\nTraining\\tR2:\\t{r2_score(y_train[:-2], y_train_pred):.2f}\\tMAE:\\t\\\n{mean_absolute_error(y_train[:-2], y_train_pred)}\\n\\\nValid+test\\tR2:\\t{r2_score(y_valid.append(y_test)[3:-3], 
y_validtest_pred):.2f}\\t\\\nMAE:\\t{mean_absolute_error(y_valid.append(y_test)[3:-3], y_validtest_pred)}\\n")\n```\n\n On the stationary time series:\n Training\tR2:\t-0.71\tMAE:\t690.867157099949\n Valid+test\tR2:\t-0.00\tMAE:\t622.5475474206966\n \n After adding trend and seasonality back in:\n Training\tR2:\t0.74\tMAE:\t591.3442957868795\n Valid+test\tR2:\t0.69\tMAE:\t622.5475474206966\n \n\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_64_1.png)\n\n\nAfter adding the trends back in, the time series auto-regression also captures the overall and seasonal trends, but the coefficient of determination is lower and the error is higher compared to the previous linear regression model. Keep in mind that we should rather use the MAE for the stationary time series for comparison, because in a real prediction situation, we would not have the trend and seasonality data for the test set and would have to try to extrapolate it from the training data. The worse model performance is not surprising considering that we only used the shape of the time series itself, whereas we used all other features in the linear regression model. The time series auto-regression may, hence, be a good alternative for tasks where not many informative features are available.\n\n### 4. Query the data with SQL and pandas \n"Consider this data was in an SQL table with features as columns. Write the SQL statement to get the average daily number of shared bikes monthly (calculate also variance of daily shared bikes for each month in two years). Do the same with pandas. Plot the distribution of average daily number of shared bikes against month/year (x-axis is the month/year)."\n\nTo query something from an SQL table, I\'ll use the `pandasql` library, which allows one to query a `pandas` data frame as if it were an SQL table.\n\n\n```python\nimport pandasql as ps\n\n# In the SQL query I group by the months of the two years and select as columns\n# the date (month/year), the average number of bike rentals and the variance of bike rentals.\n# Since, for the variance, two aggregation functions (AVG and SUM) are needed, I have\n# to get the average from a sub-query and the sum in the main query.\nquery = """\nSELECT \nSTRFTIME("%m-%Y", dteday) AS date, \nAVG(cnt) AS average,\nSUM((cnt-(SELECT AVG(cnt) FROM data GROUP BY STRFTIME("%m-%Y", dteday)))*\n (cnt-(SELECT AVG(cnt) FROM data GROUP BY STRFTIME("%m-%Y", dteday)))) \n / (COUNT(cnt)-1) AS variance\nFROM data \nGROUP BY STRFTIME("%m-%Y", dteday)\n"""\n\n# execute the query on the "data" data frame\nps.sqldf(query, locals())\n```\n\n\n\n\n
|    | date    | average     | variance     |
|----|---------|-------------|--------------|
| 0  | 01-2011 | 1231.903226 | 1.387061e+05 |
| 1  | 01-2012 | 3120.774194 | 4.448632e+06 |
| 2  | 02-2011 | 1721.964286 | 4.078639e+05 |
| 3  | 02-2012 | 3556.448276 | 6.354654e+06 |
| 4  | 03-2011 | 2065.967742 | 1.022422e+06 |
| 5  | 03-2012 | 5318.548387 | 1.882277e+07 |
| 6  | 04-2011 | 3162.333333 | 4.941022e+06 |
| 7  | 04-2012 | 5807.466667 | 2.337103e+07 |
| 8  | 05-2011 | 4381.322581 | 1.057772e+07 |
| 9  | 05-2012 | 6318.225806 | 2.789599e+07 |
| 10 | 06-2011 | 4783.733333 | 1.324805e+07 |
| 11 | 06-2012 | 6761.000000 | 3.253559e+07 |
| 12 | 07-2011 | 4559.387097 | 1.190374e+07 |
| 13 | 07-2012 | 6567.967742 | 3.017471e+07 |
| 14 | 08-2011 | 4409.387097 | 1.108881e+07 |
| 15 | 08-2012 | 6919.451613 | 3.405701e+07 |
| 16 | 09-2011 | 4247.266667 | 1.033790e+07 |
| 17 | 09-2012 | 7285.766667 | 3.887179e+07 |
| 18 | 10-2011 | 3984.225806 | 9.045461e+06 |
| 19 | 10-2012 | 6414.225806 | 3.152235e+07 |
| 20 | 11-2011 | 3405.566667 | 5.509074e+06 |
| 21 | 11-2012 | 5088.800000 | 1.666490e+07 |
| 22 | 12-2011 | 2816.870968 | 3.515087e+06 |
| 23 | 12-2012 | 3990.741935 | 1.111653e+07 |
\n\n\n\nNow let\'s do the same with `pandas` built-in functions (which I think are a bit more handy):\n\n\n```python\n# first convert the date column to datetime data format\ndata.dteday = pd.to_datetime(data.dteday)\n\n# group the data by the date column with a monthly frequency,\n# apply average (mean) and variance aggregation functions\n# and then select the "cnt" (count of rentals) column\ndata_agg = data.groupby(pd.Grouper(key="dteday",\n freq="M")).agg(["mean","var"]).cnt.reset_index()\ndata_agg\n```\n\n\n\n\n
|    | dteday     | mean        | var          |
|----|------------|-------------|--------------|
| 0  | 2011-01-31 | 1231.903226 | 1.387061e+05 |
| 1  | 2011-02-28 | 1721.964286 | 1.588093e+05 |
| 2  | 2011-03-31 | 2065.967742 | 3.035698e+05 |
| 3  | 2011-04-30 | 3162.333333 | 1.085959e+06 |
| 4  | 2011-05-31 | 4381.322581 | 3.282464e+05 |
| 5  | 2011-06-30 | 4783.733333 | 1.975339e+05 |
| 6  | 2011-07-31 | 4559.387097 | 4.625234e+05 |
| 7  | 2011-08-31 | 4409.387097 | 6.558609e+05 |
| 8  | 2011-09-30 | 4247.266667 | 9.319532e+05 |
| 9  | 2011-10-31 | 3984.225806 | 1.217672e+06 |
| 10 | 2011-11-30 | 3405.566667 | 6.213369e+05 |
| 11 | 2011-12-31 | 2816.870968 | 9.192268e+05 |
| 12 | 2012-01-31 | 3120.774194 | 7.618708e+05 |
| 13 | 2012-02-29 | 3556.448276 | 7.581614e+05 |
| 14 | 2012-03-31 | 5318.548387 | 1.565408e+06 |
| 15 | 2012-04-30 | 5807.466667 | 1.713321e+06 |
| 16 | 2012-05-31 | 6318.225806 | 1.162955e+06 |
| 17 | 2012-06-30 | 6761.000000 | 9.105072e+05 |
| 18 | 2012-07-31 | 6567.967742 | 7.520076e+05 |
| 19 | 2012-08-31 | 6919.451613 | 6.305297e+05 |
| 20 | 2012-09-30 | 7285.766667 | 9.587561e+05 |
| 21 | 2012-10-31 | 6414.225806 | 3.770663e+06 |
| 22 | 2012-11-30 | 5088.800000 | 1.276293e+06 |
| 23 | 2012-12-31 | 3990.741935 | 3.251631e+06 |
\n\n\n\nNow I will plot the average data as a function of time and use the standard deviation (square root of the variance) to show the spread of the data around the mean.\n\n\n```python\n# set up figure\nfig, ax = plt.subplots(ncols=1, nrows=1, figsize=(8, 6))\n# plot the data\ndata_agg.plot("dteday",\n "mean",\n ax=ax,\n fontsize=14,\n legend=False,\n linewidth=3,\n label="Average daily bike rentals +/- std. dev.")\n# plot the standard deviation band\nax.fill_between(x=ax.get_children()[0].get_xdata(),\n y1=data_agg["mean"] + np.sqrt(data_agg["var"]),\n y2=data_agg["mean"] - np.sqrt(data_agg["var"]),\n alpha=0.2)\n# add labels\nax.set_title("Monthly averages of daily bike rentals $\\pm$ standard deviation",fontsize=14)\nax.set_ylabel("Avergae number of bike rentals",fontsize=14)\n_ = ax.set_xlabel("Date",fontsize=14)\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/bike-sharing-prediction/output_71_0.png)\n\n\n### 5. Merging existing and new data with SQL and pandas \n"Consider you had another file with the following fields:*\n```\n- dteday : date\n- traffic : 0 for low 1 for medium 2 for high\n```\n*Assume you might not have data for all days for this table. How would you merge this\nnew data with the existing data? Explain with pandas and SQL. How would you\npreprocess this new feature before training your linear model?"\n\nFirst I will create this additional data. Since I don\'t know the usual traffic patterns I will make a simplifying assumption and assign the traffic situation randomly. In reality, there may be differences e.g. between working day commuter traffic and weekend/holiday traffic. To account for potentially missing data, I will remove some of the values randomly.\n\n\n```python\n# create randomly assigned traffic data\ntraffic_data = data[["dteday"]].merge(\n pd.DataFrame(np.random.randint(0,3,size=len(data.dteday)),\n columns=["traffic"]),\n left_index=True,\n right_index=True)\n\n# randomly remove (set to NaN) a fraction of 10% of the entries\nidx = np.random.randint(0,len(data.dteday),size=int(len(data.dteday)*0.1))\ntraffic_data.loc[idx,"traffic"] = np.NaN\n\n# have a look at the data\ntraffic_data.sample(15)\n```\n\n\n\n\n
|     | dteday     | traffic |
|-----|------------|---------|
| 225 | 2011-08-14 | 0.0 |
| 201 | 2011-07-21 | 0.0 |
| 228 | 2011-08-17 | 0.0 |
| 626 | 2012-09-18 | 1.0 |
| 628 | 2012-09-20 | 2.0 |
| 159 | 2011-06-09 | 1.0 |
| 606 | 2012-08-29 | 2.0 |
| 281 | 2011-10-09 | NaN |
| 42  | 2011-02-12 | 2.0 |
| 164 | 2011-06-14 | 1.0 |
| 462 | 2012-04-07 | 2.0 |
| 686 | 2012-11-17 | NaN |
| 646 | 2012-10-08 | 2.0 |
| 647 | 2012-10-09 | 0.0 |
| 312 | 2011-11-09 | 1.0 |
\n\n\n\nNow I\'ll merge this data with the previous data using SQL.\n\n\n```python\n# In the SQL query I perform an inner join of the two tables "data"\n# and "traffic_data" and select all columns from "data" and the \n# traffic column from "traffic_data"\nquery = """\nSELECT d.*, td.traffic\nFROM data d INNER JOIN traffic_data td\nON d.dteday = td.dteday\n"""\n\n# execute the query and look at the first few lines\nps.sqldf(query, locals()).head(10)\n```\n\n\n\n\n
|   | instant | dteday | season | yr | mnth | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | traffic |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011-01-01 00:00:00.000000 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.344167 | 0.363625 | 0.805833 | 0.160446 | 331 | 654 | 985 | 1.0 |
| 1 | 2 | 2011-01-02 00:00:00.000000 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.363478 | 0.353739 | 0.696087 | 0.248539 | 131 | 670 | 801 | 1.0 |
| 2 | 3 | 2011-01-03 00:00:00.000000 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.196364 | 0.189405 | 0.437273 | 0.248309 | 120 | 1229 | 1349 | 0.0 |
| 3 | 4 | 2011-01-04 00:00:00.000000 | 1 | 0 | 1 | 0 | 2 | 1 | 1 | 0.200000 | 0.212122 | 0.590435 | 0.160296 | 108 | 1454 | 1562 | 0.0 |
| 4 | 5 | 2011-01-05 00:00:00.000000 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 0.226957 | 0.229270 | 0.436957 | 0.186900 | 82 | 1518 | 1600 | 0.0 |
| 5 | 6 | 2011-01-06 00:00:00.000000 | 1 | 0 | 1 | 0 | 4 | 1 | 1 | 0.204348 | 0.233209 | 0.518261 | 0.089565 | 88 | 1518 | 1606 | 2.0 |
| 6 | 7 | 2011-01-07 00:00:00.000000 | 1 | 0 | 1 | 0 | 5 | 1 | 2 | 0.196522 | 0.208839 | 0.498696 | 0.168726 | 148 | 1362 | 1510 | 0.0 |
| 7 | 8 | 2011-01-08 00:00:00.000000 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.165000 | 0.162254 | 0.535833 | 0.266804 | 68 | 891 | 959 | 0.0 |
| 8 | 9 | 2011-01-09 00:00:00.000000 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.138333 | 0.116175 | 0.434167 | 0.361950 | 54 | 768 | 822 | 1.0 |
| 9 | 10 | 2011-01-10 00:00:00.000000 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.150833 | 0.150888 | 0.482917 | 0.223267 | 41 | 1280 | 1321 | 2.0 |
\n\n\n\nNow I\'ll do the same thing again with `pandas`.\n\n\n```python\n# merge the two data frames\nmerged_data = data.merge(traffic_data,left_on="dteday",right_on="dteday")\nmerged_data.head(10)\n```\n\n\n\n\n
|   | instant | dteday | season | yr | mnth | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | traffic |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.344167 | 0.363625 | 0.805833 | 0.160446 | 331 | 654 | 985 | 1.0 |
| 1 | 2 | 2011-01-02 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.363478 | 0.353739 | 0.696087 | 0.248539 | 131 | 670 | 801 | 1.0 |
| 2 | 3 | 2011-01-03 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.196364 | 0.189405 | 0.437273 | 0.248309 | 120 | 1229 | 1349 | 0.0 |
| 3 | 4 | 2011-01-04 | 1 | 0 | 1 | 0 | 2 | 1 | 1 | 0.200000 | 0.212122 | 0.590435 | 0.160296 | 108 | 1454 | 1562 | 0.0 |
| 4 | 5 | 2011-01-05 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 0.226957 | 0.229270 | 0.436957 | 0.186900 | 82 | 1518 | 1600 | 0.0 |
| 5 | 6 | 2011-01-06 | 1 | 0 | 1 | 0 | 4 | 1 | 1 | 0.204348 | 0.233209 | 0.518261 | 0.089565 | 88 | 1518 | 1606 | 2.0 |
| 6 | 7 | 2011-01-07 | 1 | 0 | 1 | 0 | 5 | 1 | 2 | 0.196522 | 0.208839 | 0.498696 | 0.168726 | 148 | 1362 | 1510 | 0.0 |
| 7 | 8 | 2011-01-08 | 1 | 0 | 1 | 0 | 6 | 0 | 2 | 0.165000 | 0.162254 | 0.535833 | 0.266804 | 68 | 891 | 959 | 0.0 |
| 8 | 9 | 2011-01-09 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0.138333 | 0.116175 | 0.434167 | 0.361950 | 54 | 768 | 822 | 1.0 |
| 9 | 10 | 2011-01-10 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 0.150833 | 0.150888 | 0.482917 | 0.223267 | 41 | 1280 | 1321 | 2.0 |
\n\n\n\nIf this data is to be used in a machine learning model, one has to consider that the `traffic` feature contains missing values.\n\n\n```python\nprint("Number of missing values in the traffic column:",merged_data.traffic.isna().sum())\n```\n\n Number of missing values in the traffic column: 69\n\n\nI already discussed how missing values can be handled in question [3a](#q3a). We could either drop these instances or try to impute the missing values with the most frequent value, the mean, or the median. Since the values are ordinal and not nominal or continuous, I think the median makes the most sense. Forward, backward, or rolling window filling may also be an option. Let\'s have a look at how that could be done:\n\n\n```python\n# Scikit-learn offers a SimpleImputer\nfrom sklearn.impute import SimpleImputer\n\n# impute with median value\nimp_traffic = SimpleImputer(strategy="median").fit_transform(merged_data[["traffic"]])\n\nprint("Missing values:",np.isnan(imp_traffic).sum())\n```\n\n Missing values: 0\n\n\nThis can also be done with `pandas` in various ways:\n\n\n```python\n# first get the missing values, then impute them\n\n# impute with the median\nmissing = merged_data.traffic[merged_data.traffic.isna()].copy()\nmissing[:] = merged_data.traffic.median()\n\n# impute working day and non-workingday differently\n# working days\nmissing_wd = merged_data.loc[(merged_data.workingday==1)&merged_data.traffic.isna(),\n "traffic"].copy()\nmissing_wd[:] = merged_data.loc[(merged_data.workingday==1),\n "traffic"].median()\n# non-working days\nmissing_nwd = merged_data.loc[(merged_data.workingday==0)&merged_data.traffic.isna(),\n "traffic"].copy()\nmissing_nwd[:] = merged_data.loc[(merged_data.workingday==0),\n "traffic"].median()\n\n# use forward or backward filling\n# forward\nmissing = merged_data.traffic.copy().fillna(method="ffill")\n# backward\nmissing = merged_data.traffic.copy().fillna(method="bfill")\n\n# fill with a rolling window median\nrolling_median = merged_data.traffic.rolling(window=6, min_periods=1).median()\nmissing = merged_data.traffic[merged_data.traffic.isna()].copy()\nmissing[:] = rolling_median.iloc[missing[:].index]\n```\n\nWhich strategy for handling missing values performs best depends on the structure of the real data. In any case, it can be considered as part of the hyper-parameter optimization.\n\n### Conclusion \n\nIn this technical challenge I discussed many topics around making future predictions for bike rentals in a bike sharing program. 
I talked about the usefulness of data for training a predictive model, the treatment of variables/features, training a linear regression model, as well as querying and merging data with SQL and pandas.\n\nI hope I provided some interesting information and I would be glad if you would consider me for the position.\n\nIf you would like to see more of my work, feel free to have a look at the projects on my [GitHub](https://github.com/Pascal-Bliem), for example my end-to-end data science project on the latest [European Social Survey](https://github.com/Pascal-Bliem/european-social-survey), a discussion on [error rate control](https://github.com/Pascal-Bliem/error-control-statistical-tests) in statistical testing, or my exploration of specific smaller [machine learning topics](https://github.com/Pascal-Bliem/exploring-the-UCI-ML-repository) on lesser-known data sets.\n\nThanks a lot for following me all the way through this notebook!\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Error rate control in statistical significance testing","How to perform statistical tests efficiently without fooling yourself",new Date("2019-11-02"),"https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/effectsize_and_errors.png","Let's get to know the interactions of effect sizes, sample sizes, power, and errors in statistical tests!",["Data Science & AI/ML","Learning"],"In this post I'll describe a project which I worked on to improve my own understanding of errors and error control in classic frequentist statistics. You can also find this project on my [Github](https://github.com/Pascal-Bliem/error-control-statistical-tests/) or in an [interactive Jupyter Notebook on mybinder.org](https://mybinder.org/v2/gh/Pascal-Bliem/error-control-statistical-tests/master?filepath=Error_control_in_statistics.ipynb). The latter might be the nicest way to consume this post because it contains interactive visualizations.\n\n\nIn the following, I will discuss the important aspects of proper error control in statistical significance testing with the help of many simulated test studies and visualizations. Understanding these aspects is crucial to ensure that results of statistical inference are actually meaningful and that the potential error does not get bigger than what the experimenter deems acceptable. An improper treatment of error control and the resulting large amounts of false positive findings have led to a reproducibility crisis in some fields of academic research, as described in the now famous publication [*Why Most Published Research Findings Are False*](https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.0020124). Less publicly visible, but similarly harmful, are the same problems in the context of studies and experimentation within industry. Companies that want to leverage statistical inference to better understand their customers or improve their processes will fail to produce the expected benefits if they are not aware of the errors they are making.\n\n> Richard Feynman: *\"The first principle is that you must not fool yourself - and you are the easiest person to fool.\"* \n\nI will discuss the following aspects here:\n1. [An introduction to statistical testing](#testing) \n - [Motivation](#testing1)\n - [Null hypothesis significance tests](#testing2)\n2. [P-values and significance levels](#pvalue)\n - [The p-value](#pvalue1)\n - [Significance levels](#pvalue2)\n3. 
[Error types and statistical power](#error)\n - [The two types of errors](#error1)\n - [Relationship between errors](#error2)\n - [P-values as a function of power](#error3)\n4. [Error correction for multiple tests](#multiple)\n - [False positive inflation by multiple tests](#multiple0)\n - [Bonferroni correction](#multiple1)\n - [Holm correction](#multiple2)\n - [False discovery rate](#multiple3)\n5. [Error correction for optional stopping](#stop)\n - [P-value as a function of sample size](#stop1)\n - [False positive inflation by optional stopping](#stop2)\n - [Pocock and O'Brien-Fleming boundaries](#stop3)\n - [Alpha spending function](#stop4)\n6. [Power increase and error comparison](#power)\n - [Attempts to avoid false negatives](#power1)\n - [False negative and positive comparison](#power2)\n - [Confirmatory analysis and pre-registration](#power3)\n7. [Conclusion](#conclusion)\n\nOne more note: I was inspired to write up this topic and to perform some simulation studies for myself to gain a better understanding after taking the highly recommendable course *Improving your statistical inferences* by Daniel Lakens of TU Eindhoven on [coursera.org](https://www.coursera.org/learn/statistical-inferences?). Daniel is a great teacher and I want to acknowledge the effort he put into this course.\n\n## Statistical significance testing \n### Motivation \nIn the real world, most processes are not perfectly deterministic. We expect certain outcomes with certain probabilities; it is normal to observe some spread in the quantities we are studying. You have good days, bad days, and anything in between. That also means that if we are observing differences or correlations between quantities of interest, we need to find out if the difference or correlation is caused by an actual underlying effect, or if it is just due to random fluctuations. We want to know if the observed effect is **significant**. \n\nAn example: You make changes to your e-commerce website which are intended to make it easier for your customers to put articles in their shopping cart and check out. You do, in fact, observe a slight increase in conversion rate. But is that really due to the changes you made, or could it be just due to random chance? Is the effect you observe statistically significant?\n\nIf we consider something as significant, it basically means that we would consider the observation as very surprising or unlikely if there were no real underlying effect. To put this idea into a mathematical framework, statistical hypothesis tests are used. This is also referred to as confirmatory analysis, because we are testing hypotheses which were probably generated in an exploratory part of the research process. I'll elaborate a bit on why it is good to keep these parts separated later on.\n\n### Null hypothesis significance tests \nThe most common kind of test approach is a null hypothesis significance test. The null hypothesis reflects the assumption that there is no real effect (i.e. no difference, no correlation, or whatever we are interested in). Opposed to it is the alternative hypothesis, stating that there is an effect different from zero (it can also be specified as smaller or larger than zero). We know how observations should be distributed if the null were true, and so we can see how surprising the observed data would be under that assumption. If the data is very surprising under the null, we reject the null in favor of the alternative. \n\nHow surprising does it have to be? 
I'll discuss the border line for significance, called significance levels, in the next section. But we'll have to calculate a test statistic from the data first, which could then either cross this border or not. Depending on what we want to test for, how many groups we consider, and what assumptions we make about the distribution of the data, there are a lot of parametric and non-parametric tests we can choose from. I want to keep the discussion as general as possible so I will not go into detail here. Nonetheless, one has to be aware that it is important to choose an appropriate test statistic. If assumptions of the test are violated, the results can be very wrong.\n\nBesides null hypothesis significance tests there are also equivalence tests. Testing against the null hypothesis will just tell you if there is an effect different from zero. This effect may still be smaller than what you care about, than what you would consider as \"equivalent\". These tests check not only whether there is an effect at all, but also whether it lies outside of the equivalence range you define. Since not everyone knows what effect sizes to expect and null hypothesis significance tests are much more common, I will focus on the latter in this discussion.\n\n## P-values and significance levels \n\nLet's first talk about what we actually mean by **probabilities** in this context. Null hypothesis significance tests are part of the framework of frequentist statistics. As the name already implies, probabilities assigned to observed data are considered as long-term frequencies of the data. If we repeated our sampling process an infinite number of times, the frequency with which the observed data appeared would correspond to its probability under the given hypothesis. It is important to note that we consider the probability of data under the assumption of a hypothesis and **not** the probability of any hypothesis being true or false. This is a different idea of probability than, e.g., in Bayesian statistics, where we interpret probability as a degree of belief in a hypothesis.\n\n### The p-value \nIf we want to know how surprising the observed data is under the null hypothesis, we can use the test statistic which was calculated from the data, compare it to the distribution under the null, and calculate the probability of observing a value as extreme as or more extreme than the observed value. This probability is called the **p-value**. It is the probability of getting the observed data or even more extreme data, given that the null hypothesis is true. It **is not** the probability of the theory or the hypothesis being true or false. I'm stressing this here because it is often confused in practice.\n\nAn example: We want to know if the mean of a sample is significantly different from the mean of a population (e.g. sales to a specific group of customers compared to all the customers). If the sample were from the population, the differences in sample mean and population mean should be normally distributed around zero. Due to sampling error, the difference will usually not be exactly zero, but we expect it to not be very large either.\n\nLet's look at the figure below. We can see how observations would be distributed under the null hypothesis, $H_0$. We can also see one observation and a red shaded area under the distribution. This (integrated) shaded area corresponds to the probability of observing data as extreme or more extreme than this observation. 
\n\nLet's look at the figure below. We can see how observations would be distributed under the null hypothesis, $H_0$. We can also see one observation and a red shaded area under the distribution. This (integrated) shaded area corresponds to the probability of observing data as extreme as or more extreme than this observation. This probability is the p-value.\n\n![Probability of the data under the null hypothesis.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/probability_of_data_under_h0.svg)\n\n### Significance levels \nWe can see in the figure above that the more extreme the data would be under the null, the smaller its probability (its p-value). We can now freely pick a boundary beyond which we consider the data too unlikely to appear under the null. This boundary is called the **significance level, $\\alpha$**. It is very often chosen to be 0.05, but in principle one can choose any value that one considers reasonable for the given situation. If the p-value is smaller than $\\alpha$, we will **reject** the null hypothesis in favor of the alternative hypothesis. If the p-value is larger than $\\alpha$, we will **fail to reject** the null hypothesis.\n\nYou can find this concept visualized in the figure below. Given a significance level of $\\alpha$ = 0.05, we will retain $H_0$ if the observed data falls within the 95% most likely values (white-shaded area). We will reject $H_0$ if the observed data falls within the 5% least likely values (red-shaded area). In this case we consider extreme values both on the low and the high end, but one can also conduct one-sided tests to either side.\n\n![A visualization of the significance level.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/significance_level_alpha005.svg)\n\nAgain, the p-value **is not** the probability of the theory or the hypothesis being true or false. Probabilities in a frequentist context are the long-term frequency of the data, under an assumed hypothesis. P-values do not tell you the truth about an individual case; in every explicit example, in every individual test, the null is either true or not. There either is an effect or not. P-values should only be used to separate signal from noise and to guide behavior in the long run to ensure a proper control of the error rates, which is what I will discuss in the next section.\n\n## Error types and statistical power \n\n### The two types of errors \nLet's think about what can happen in a null hypothesis significance test. Either the null or the alternative hypothesis is true. If the null is true and we do not find a statistically significant effect, we get a **true negative**. If the alternative hypothesis is true and we find a statistically significant effect and reject $H_0$, we get a **true positive**. But what if we were wrong? We can find a significant effect even though there actually is no effect. That is a **false positive** or **type 1 error**. We can also fail to detect a significant effect even though there is one. That is a **false negative** or **type 2 error**.\n\nAs discussed above, by defining $\\alpha$, we define how unlikely data has to be under $H_0$ for us to reject $H_0$. This way we also define the false positive rate. If $\\alpha$ = 0.05, that means that in at most 5% of the cases, the data that makes us reject $H_0$ could actually have occurred under $H_0$ and we rejected it incorrectly. At $\\alpha$ = 0.05, at most 1 out of 20 tests could, hence, be a false positive. We see an effect where there actually is none.\n\nIf the data is not surprising under the null, it does not necessarily mean that there is no true effect. Maybe there is none, but maybe we have just failed to detect it and gotten a false negative. The false negative error rate is usually denoted as $\\beta$. 
The quantity 1-$\\beta$ is called **power** and it represents the probability of detecting an effect when it is actually there, i.e. obtaining a true positive. The higher the power, the more likely it is to detect a real effect and, consequently, the less likely to obtain a false negative.\n\n### Relationship between errors \nThe power itself depends on the chosen significance level, $\\alpha$, the effect size, $d$, and the sample size, $n$. I have visualized this relationship in the interactive figure below. The distributions under the null, $H_0$, and the alternative, $H_a$, hypotheses are displayed. The centers of the distributions are separated by the effect size, $d$. The shaded areas represent the probabilities of true negatives (white), false positives (red), false negatives (blue), and true positives or power (green). \n\n![True negatives (white), false positives (red), false negatives (blue), and true positives or power (green), as a function of effect size.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/effectsize_and_errors.png)\n\nThe less these two distributions overlap, the smaller the errors. We can choose the false positive rate $\\alpha$ freely, but the smaller we set it, the more we push the $H_0$-rejection-boundary into the distribution of $H_a$, thereby reducing power. The larger the sample size, the smaller the spread of the distributions (e.g. the standard error of the mean scales with $1/\\sqrt{n}$), increasing the power. The smaller the effect size, the closer the distributions come together, decreasing power.\n\nIf we don't want to compromise on $\\alpha$ and we cannot influence the actual $d$, we have to choose the smallest $d$ that seems relevant to us and choose the sample size accordingly to achieve the desired power. Small effects need larger samples. Is there a real effect or not? If the p-value is over 0.05 you cannot answer this question. Maybe there is none, maybe your study was just under-powered.\n\nIf you design your studies with power in mind, you will not make more errors in the long run than the rates you control for. It will, however, not tell you if an individual finding is true or false - this is one of the reasons why replication studies are so important in experimentation. Errors will occur by design. We cannot avoid that, but if we replicate, we will at least be able to detect them in hindsight. If you're already tired of me mentioning over and over again how frequentist statistics is all about long-term, infinite-repetition-kind-of concepts, you should probably grab a Bayesian stats book.\n\n### P-values as a function of power \nIf we still want to stick to frequentist hypothesis testing and the idea of probabilities as long-term frequencies, we will probably wonder what distribution of p-values to expect in the long run, given a certain effect.\n \nI took a simulation approach in this section. That's actually always a great idea if possible; simulate the process you are interested in, and compare your observations with the simulation's output. In this case I simulated 10000 Student's t-tests on samples of size 100, each sampled from normal distributions separated by $d$. I repeated this simulation for different values for $d$, each corresponding to powers in the range of 0 to 1 in steps of 0.1, at $\\alpha$ = 0.05.
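\n\nA stripped-down version of this kind of simulation could look like the sketch below (my own illustration with fewer repetitions; the sample size and the effect sizes are assumptions picked for demonstration).\n\n```python
# Sketch of the p-value-vs-power simulation (illustrative, not the original script).
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
n, n_sims, alpha = 100, 2000, 0.05

for d in [0.0, 0.2, 0.5]:  # standardized effect sizes; d = 0.0 means H0 is true
    p_values = np.array([
        stats.ttest_ind(rng.normal(0, 1, n), rng.normal(d, 1, n)).pvalue
        for _ in range(n_sims)
    ])
    # for d = 0 this fraction hovers around alpha; for d > 0 it approaches the power
    print(f'd = {d}: fraction of p < alpha is {np.mean(p_values < alpha):.3f}')
```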
\n\nWe can see histograms of the p-values coming out of the 10000 simulated t-tests below. For a power of 0, corresponding to $d$ = 0 (no difference, $H_0$ is true), we can see that the p-values are uniformly distributed between 0 and 1. We can get any p-value with equal probability. Now, this makes a lot of sense in relation to $\\alpha$, the significance level or false positive rate. If we choose $\\alpha$ = 0.05, we would declare every p-value below 0.05 (shaded in red) as statistically significant and reject the null. As we can see here, where $H_0$ is true, we would make a mistake 5% of the time because under $H_0$ being true, 5% of the simulated p-values are lower than or equal to 0.05.\n\n![Simulated p-values at different powers.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/pvalue_function_of_power.png)\n\nIf we increase the power, we introduce a real effect, which means now $H_0$ is false. In this case we would like to get a significant result so that we can reject $H_0$. At low powers, however, there are still plenty of p-values above the significance threshold. The higher the power, the more p-values will fall under $\\alpha$. E.g. at a power of 0.5, ca. 5000 out of 10000 simulated p-values are lower than $\\alpha$. This is the intuition behind the definition of power: the probability of detecting an effect if there actually is an effect. If you want to be sure to not miss the smallest effect you care about, make sure you have sufficient power by having a large enough sample size.\n\nThe higher the power, the more very small p-values can be expected if $H_a$ were true. That also means that when you know that you have a very high power, not all p-values below $\\alpha$ should be seen as evidence for $H_a$. At very high power (e.g. 0.99), under $H_a$, a p-value of e.g. 0.001 would be much more likely than 0.049. You may then conclude that p=0.049 would be unlikely under $H_0$, but it would be even more unlikely under $H_a$. This is also known as [Lindley's paradox](https://en.wikipedia.org/wiki/Lindley's_paradox).\n\n## Error correction for multiple tests \n\n### False positive inflation by multiple tests \nAs I have discussed above, we will set the false positive rate by setting the significance level $\\alpha$. If we choose to set $\\alpha$=0.05, we are willing to accept that one out of 20 findings might be a false positive. This naturally implies that, the more tests we conduct, the higher the chance that false positives will be among them. The overall significance level $\\alpha$ would change to $\\alpha = 1 - (1-\\alpha_i)^n$, where $\\alpha_i$ is the significance level for the individual tests and $n$ is the number of tests. That means if we conduct multiple comparisons within one study and we want to keep the study's overall error rate at the desired level, we will have to control the family-wise error rate of all tests involved. We cannot just apply the overall $\\alpha$ to every test individually, but have to adjust every individual test's $\\alpha_i$ so that the overall $\\alpha$ is controlled. 
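\n\nTo get a feeling for how quickly this inflation grows, here is a quick evaluation of the formula above (my own illustration; the numbers of tests are arbitrary examples):\n\n```python
# Family-wise error rate for n independent tests, each run at alpha_i = 0.05
alpha_i = 0.05
for n in [1, 3, 6, 10, 28]:
    family_wise_alpha = 1 - (1 - alpha_i) ** n
    print(f'{n:2d} tests -> family-wise alpha = {family_wise_alpha:.3f}')
```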
\n\nAn example: Imagine you're conducting a multi-factor ANOVA test in which you want to see what influence several factors could have on an observed quantity, both alone and in interaction with each other. You may, e.g., be observing people's life expectancy and testing if there is a significant correlation with gender (only male or female here for simplicity), smoking (yes or no), and marriage status (married or not), or with the 2-way interaction terms (e.g. male and smoking). In this setup with 3 factors, considering 2-way interactions and 1-way main effects, you'd already be conducting 6 tests at once and you'd get 6 p-values. If you'd keep $\\alpha$=0.05 for each of these p-values, the family-wise error rate would be much higher than 0.05.\n\nI simulated this scenario (ANOVA with 2-way interactions and 1-way main effects) with 3 to 7 factors resulting in 6 to 28 tests per study. I sampled the dependent variable randomly from a normal distribution, which means there is no real effect. $H_0$ is true for all these simulated examples. As you can see in the figure below, much more than 5% of the 10000 simulated studies will show some significant results. The uncorrected multiple comparisons have inflated the family-wise error rate.\n\n![The family-wise error rate gets inflated by performing multiple tests without correction.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/pvalue_inflation_multiple_tests.png)\n\n### Bonferroni correction \nLuckily, there are a couple of methods with which the false positive rate can be controlled when doing multiple comparisons. The most straightforward method is probably the Bonferroni correction (which should maybe rather be attributed to [Dunn](https://en.wikipedia.org/wiki/Olive_Jean_Dunn)). I couldn't find an original paper, but the idea is simple: the significance level, $\\alpha$, is just divided by the number of tests. Alternatively, each test's p-value can be multiplied by the number of tests, keeping the original $\\alpha$. P-values larger than one do not make any sense, so the adjusted p-values will be clipped off at a maximum of one.\n\nWe can see in the figure below how the Bonferroni correction strictly controls the family-wise error rate for any number of tests. The large bin at 1.0 results from adjusted p-values larger than 1.0 being clipped off at 1.0.\n\n![Applying the Bonferroni-correction strictly controls the family-wise error rate for multiple tests.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/bonferroni_correction.png)\n\nThe Bonferroni correction, however, assumes statistical independence between all tests, which is usually not the case in studies where many tested factors could be related to each other. In that case, the Bonferroni correction is too conservative in the sense that it may reduce the individual tests' $\\alpha_i$ more than necessary. This can be a problem because it reduces the power of the tests, making them less likely to detect an effect if there is one and, thereby, increasing the chance of false negatives.\n\n### Holm correction \nThere is another option which addresses this problem. Actually, there are plenty of other options (find an overview [here](https://en.wikipedia.org/wiki/Family-wise_error_rate#Controlling_procedures)), but I'll only discuss the most common ones to not explode the scope of this work. The next one I want to talk about is the [Holm correction](https://www.jstor.org/stable/4615733?seq=1#page_scan_tab_contents), which is a bit like a step-down sequential Bonferroni correction. The p-values are first reverse-ranked, and then the overall $\\alpha$ is divided by each p-value's reverse rank to give the corresponding $\\alpha_i$ (i.e. the $\\alpha_i$ for the smallest p-value is obtained by dividing by the highest rank). Once we fail to reject a $H_i$, we can stop testing because the following ones cannot be significant either. 
We can, of course, also adjust the p-values instead of the $\\alpha_i$ by multiplying them with their reverse rank or setting them equal to the cumulative maximum of adjusted p-values. That means if $p_1$ (the smallest $p_i$) multiplied with its reverse rank is already larger than $p_2$ multiplied with its reverse rank, the adjusted value for $p_2$ will be set equal to $p_1$ multiplied with its reverse rank. You can find the equation [here](https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method#Adjusted_p-values) if the verbal explanation is a little confusing.\n\nThe advantage of Holm over Bonferroni is that it is uniformly more powerful. It doesn't lower each individual test's $\\alpha_i$ as much and, hence, does not reduce its power as much. It does not inflate the false negative rate that much while still controlling the false positive rate quite well. In the figure below you can see the distribution of Holm-corrected p-values from the same simulated studies as before.\n\n![The Holm-correction does not inflate the false negative rate that much while still controlling the false positive rate quite well.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/holm_correction.png)\n\n### False discovery rate \nThe Bonferroni and Holm correction methods aim at controlling the **family-wise error rate**, that is, the probability of making **at least one** false positive per study. It can be considered as an error rate per experiment or per study. Another approach for controlling false positives is provided by controlling the **false discovery rate**. This aims at controlling the overall proportion of discoveries that are false directly, and not at controlling the rate of experiments which may be flawed with at least one false positive.\n\nProbably the most common procedure of this kind was proposed by [Benjamini and Hochberg](https://www.jstor.org/stable/2346101?seq=1#page_scan_tab_contents). It works a bit like the Holm method, but bottom-up. The p-values are sorted and ranked. Then, starting from the highest p-value, the adjusted p-value is the lower of either $p_i * n / i$ (the $i^{th}$ p-value, $p_i$, multiplied by the number of tests, $n$, and divided by its rank, $i$) or the cumulative minimum of the previously adjusted p-values. That means if one of the previously adjusted p-values was already lower than the adjusted $p_i$, then $p_i$ is adjusted to that lower value. This [video](https://www.youtube.com/watch?v=K8LQSvtjcEo) walks you through the idea step-by-step in case this short summary sounds a little opaque.\n\nThe figure below shows the distribution of p-values adjusted with the false discovery rate (Benjamini-Hochberg) method for the same 10000 simulated studies as before in which $H_0$ is true.\n\n![Controlling the false discovery rate means controlling the overall proportion of false discoveries rather than the rate of experiments which may have at least one false positive.](.images/fdr_correction.png)\n\nCompared to the family-wise error control methods, the false discovery rate provides larger power but often higher false positive rates. Which method you want to choose really depends on the application. In my personal experience, both academics and people in industry are not as diligent in replicating experiments as they maybe should be and, hence, I like the idea of a more stringent false positive control. But in many high-throughput experiments where many variables to test can be extracted from small-sized samples (e.g. genetic sequencing), power can already be intrinsically low. It may then be a good idea to use the false discovery rate as a higher-powered correction method rather than the family-wise error rate methods.
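\n\nAll of the corrections discussed above are available in Python's `statsmodels`; the sketch below (my own illustration, with made-up p-values) applies Bonferroni, Holm, and Benjamini-Hochberg to the same set of p-values so that the adjustments can be compared.\n\n```python
# Comparing Bonferroni, Holm, and Benjamini-Hochberg adjustments with statsmodels
# (the p-values are made up for illustration).
import numpy as np
from statsmodels.stats.multitest import multipletests

p_values = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.06])

for method in ['bonferroni', 'holm', 'fdr_bh']:
    reject, p_adjusted, _, _ = multipletests(p_values, alpha=0.05, method=method)
    print(f'{method:>10}: adjusted p = {np.round(p_adjusted, 3)}, reject H0 = {reject}')
```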
\n\n## Error correction for optional stopping \n\nAs we now know, we need sufficiently large samples if we want to have a high power. But collecting more data points is often very time- and money-consuming. During the collection period we may wonder if the information we need is already in the data that was collected up till now. In situations like medical trials, coming to a true positive result earlier could even save lives. We could peek and do a test already, maybe the p-value will be lower than $\\alpha$ already. But hey, that's dangerous. We would basically be performing multiple tests again, inflating the false positive rate. The different methods we can use to do this correctly are generally summarized under the term **sequential analysis**. I will discuss the basic ideas and most commonly used correction methods here, but I also want to point you to a [nice introductory paper](http://datacolada.org/wp-content/uploads/2015/12/5367-Lakens-EJSP-2014-Performing-high-power.pdf) by Daniel Lakens with more details, references, and some real-world examples.\n\n### P-value as a function of sample size \nLet's remember from before that if there is no effect, $H_0$ is true, then p-values are uniformly distributed. So if we perform a test after every data point collected, the p-value should be below 0.05 about 5% of the time. We will inevitably find a p-value below $\\alpha$ at some point. The more frequently we look, the higher the chance that at least one of the looks will yield a significant p-value and, if we then stop data collection, we will produce a false positive.\n\nI set up a simulation where samples are randomly drawn from normal distributions and a Student's t-test is performed on them. You can see some of the results in the figure below where I changed the sample size, $n$, and effect size, $d$. For $d$=0 there is no effect, $H_0$ is true. There will still be around 5% of cases in which the p-value drops below 0.05. Now you can understand why \"peeking\" can be dangerous without correction. You might accidentally peek while the p-value has randomly dropped below $\\alpha$.\n\n![The p-values observed when \"peeking\" as a function of sample size at different effect sizes. Even if there is no real effect, the p-value will occasionally drop below the significance level.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/pvalue_function_of_samplesize.png)\n\nIf you increase $d$ above zero, there is a real effect, $H_0$ is false. Whether we can detect the effect or not depends on the power. As discussed before, power depends on $\\alpha$, $d$, and $n$. At $d$=0.3 and a sample size $n$=90, the power is a bit higher than 50%, which means the p-value should drop below $\\alpha$ in a bit more than half the cases.\n\nIf you increase the sample size and keep the effect size low, you can observe that sometimes the p-value drops below $\\alpha$ but then rises above $\\alpha$ until it finally drops again. This shows that even if there is a real effect, if we don't know the exact effect size and cannot calculate the required power in advance, it can be hard to know exactly which sample size is really sufficient.
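\n\nA single run of this \"peeking\" process can be reproduced in a few lines (my own sketch, not the original simulation code; the sample size, seed, and effect size are arbitrary assumptions).\n\n```python
# Sketch of 'peeking': re-running a t-test after every additional pair of observations.
# With d = 0 the null hypothesis is true, yet the p-value trajectory still dips occasionally.
import numpy as np
from scipy import stats

rng = np.random.default_rng(7)
d, max_n = 0.0, 200
a = rng.normal(0, 1, max_n)   # control group
b = rng.normal(d, 1, max_n)   # group shifted by the effect size d

p_trajectory = [stats.ttest_ind(a[:n], b[:n]).pvalue for n in range(10, max_n + 1)]
print(f'minimum p-value over all looks: {min(p_trajectory):.3f}')
# stopping at the first look with p < 0.05 would, in the long run, inflate the false positive rate
```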
\n\n### False positive inflation by optional stopping \nNow, is there anything we can do about optional stopping while the data collection is still running, without completely messing up the false positive rate? Well, I guess if there wasn't, I wouldn't be writing this paragraph. But before we discuss methods to correct $\\alpha$, let's try to understand how it inflates when **optional stopping** is performed. Optional stopping means that we will already perform interim tests (looks) before the entire final sample size is collected. If one of these interim tests yields a significant p-value, we conclude a significant result already and stop data collection. It is a bit more complicated than for the case of independent multiple tests because the probabilities are conditional now: The probability of finding a significant result at the second look also depends on the probability of not finding a significant result at the first look. \n\nI simulated (this time 100000 times) the situation of performing Student's t-tests with optional stopping on samples of size 100 which are randomly drawn from the same normal distribution, which means there is no real effect, $H_0$ is true. For each simulated study, 2, 4, or 5 looks are performed. If the p-value is below $\\alpha$ at one of the looks, this p-value is considered for this particular simulation. If not, the p-value of the final look is considered.\n\nYou can see the results in the figure below. I have chosen smaller bins now so that the structure of the p-value distribution is visible more clearly. You can see that if optional stopping is performed, values that should have been above the significance level get pulled below it and contribute to the false positive rate. The more looks per study are allowed, the more the false positive rate gets inflated.\n\n![Taking additional looks to perform optional stopping inflates the error rate if no correction is performed.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/optional_stopping.png)\n\n### Pocock and O'Brien-Fleming boundaries \nTo correct the significance levels or p-values for the individual looks, some boundary methods were proposed. The idea is that, under $H_0$, for $n$ interim tests/looks, the conditional probabilities of obtaining a significant p-value at each of the $n$ looks, considering the probabilities of not obtaining a significant p-value at the previous looks, all have to add up to the overall desired significance level, $\\alpha$. That basically means that the $\\alpha$ is being spent over the course of the interim tests in a way that the overall $\\alpha$ is controlled to the desired level.\n\nThe [Pocock boundary](https://academic.oup.com/biomet/article-abstract/64/2/191/384776?redirectedFrom=fulltext) correction method adjusts the $\\alpha_i$ for each individual interim test to the same value (or alternatively multiplies each $p_i$ with $\\alpha/\\alpha_i$). You can find tabulated values for different numbers of interim tests [here](https://newonlinecourses.science.psu.edu/stat509/node/80/). E.g., for five interim tests, each $\\alpha_i$=0.0158. You can see how the Pocock correction controls the false positive rate in the figure below.\n\n![The Pocock boundary correction adjusts each interim significance level to control the overall error rate.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/pocock_correction.png)
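\n\nIn practice, using such a boundary just means comparing each interim p-value against the corrected $\\alpha_i$ instead of the overall $\\alpha$. Here is a rough sketch (my own illustration; the interim p-values are made up, and 0.0158 is the tabulated Pocock value for five looks quoted above):\n\n```python
# Checking interim results against a Pocock boundary for 5 planned looks
pocock_alpha = 0.0158                                   # per-look alpha for 5 looks
interim_p_values = [0.21, 0.043, 0.019, 0.012, 0.031]   # hypothetical p-values at looks 1..5

for look, p in enumerate(interim_p_values, start=1):
    if p < pocock_alpha:
        print(f'look {look}: p = {p} < {pocock_alpha} -> stop early and reject H0')
        break
    print(f'look {look}: p = {p} >= {pocock_alpha} -> keep collecting data')
else:
    print('no look crossed the boundary -> fail to reject H0')
```\n\nNote that the made-up p-values of 0.043 and 0.019 would have looked \"significant\" against an uncorrected $\\alpha$ = 0.05, but they do not cross the corrected boundary.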
\n\nSince the Pocock method has the same $\\alpha_i$ for each interim test, this means that it spends relatively much $\\alpha$ at the earlier looks. It allows for a relatively loose significance level at the early stages of data collection and, hence, is more likely to allow for early termination of data collection. However, at later stages when more data is collected and real effects should be easier to detect, the significance level is relatively strict, so that it is harder to conclude significance in the final stage. An example: if a final test yields a p-value of 0.02, it would have been significant if no sequential analysis was performed, but in the case of a Pocock-corrected study with 5 interim tests, the significance level for the final test is 0.0158, so that the overall study has to be seen as not significant. \n\nThis goes a bit against the intuition that with more data, it should be easier to obtain significant results if there is a true effect. The [O'Brien-Fleming boundary](https://www.jstor.org/stable/2530245?seq=1#page_scan_tab_contents) (OBF) addresses this issue by spending less $\\alpha$ at the earlier interim tests and more towards the end. Of course, that makes it much more difficult to stop at an earlier stage. For five interim tests, the OBF-corrected $\\alpha_i$ are $\\alpha_1$=0.000005, $\\alpha_2$=0.0013, $\\alpha_3$=0.0085, $\\alpha_4$=0.0228, and $\\alpha_5$=0.0417. You can see in the figure below how the OBF correction manages to control the false positive rate as well.\n\n![The O'Brien-Fleming boundary correction controls the error rate by spending more alpha towards the end of the experiment.](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/error-rates-statistics/obf_correction.png)\n\n### Alpha spending function \nWhen we run simulations as above, we can just choose how many interim tests we want to do, and in many properly planned experiments, this can be scheduled in advance as well. But there may be situations in which it is not clear in advance how many looks should be performed. Maybe data collection will be easier or cheaper at the beginning or the end of the collection process. Maybe it also makes sense to not have an equal spacing of data points between the looks. The boundary methods discussed above do not provide this flexibility.\n\n[DeMets and Lan](https://eclass.uoa.gr/modules/document/file.php/MATH301/PracticalSession3/LanDeMets.pdf) have developed the alpha spending function approach to address this inflexibility. They proposed continuous approximations of the Pocock or OBF boundaries (there are also linear and power-family functions) to calculate how much $\\alpha$ is spent if one performs an interim test. I don't want to go into too much more detail here, as I assume the results of simulations with this correction would not look much different from what I've already simulated above. If, however, you are in a situation where you'd like to calculate your alpha spending flexibly, there is a [package in R](https://github.com/rpahl/GroupSeq) called `GroupSeq` for this purpose.\n\nA final remark about optional stopping: If a study is stopped early, obviously that means the sample size is smaller. Maybe a statistically significant result was found, but the quality of the effect size estimate will be lower. I don't want to discuss effect sizes (I discuss them in [another project](https://github.com/Pascal-Bliem/european-social-survey) of mine) and confidence intervals here because that would inflate the scope of this project too much. 
Be aware, however, that effect sizes are usually biased upwards for small samples and that the confidence intervals around the effect size estimates are much larger for smaller samples, which means there is more uncertainty associated with the estimate.\n\n## Power increase and error comparison \n\n### Attempts to avoid false negatives \nI have actually been discussing false negatives already a lot in all the previous sections when talking about power. Remember, power is 1-$\\beta$, where $\\beta$ is the false negative rate (also called the type 2 error rate). False negative error control is generally a bit trickier because the power always depends on the actual effect size, which is usually not known in advance. We can try to define the power for the smallest effect size we care about and define the sample size we need to achieve it at the given significance level, $\\alpha$, and just go with that. But the actual power will still depend on the size of the actual effect. It is hard to quantify exactly what your false negative rate will be, but there are a few things that can be done to lower it. \n\nThe first and best thing to do is always to increase the sample size. The larger the sample size, the higher the power, and the less biased and more accurate the effect size estimate. Another strategy is trying to keep the measurement error as low as possible to decrease the variability in the data. As displayed in the interactive plot above showing $H_0$ and $H_a$, the less the two distributions overlap, the higher the power will be. Some examples: if you measure some natural phenomenon, perform several measurements per subject; if you survey people, base their score on several questions with wide enough response options; if you track the users on your website, maybe use a success metric that is based on several user actions. When testing different conditions on people, one can greatly reduce variability by using a within-subject design instead of a between-subject design (testing different conditions on the same group, before and after, instead of on different groups). If one already has a directional hypothesis (e.g. an increase in the quantity of interest), one can perform one-sided instead of two-sided tests. In a one-sided test, the critical value corresponding to the same $\\alpha$ will be further away from $H_a$ and, hence, the power will be higher.
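\n\nIf you want to put a number on the required sample size before collecting data, a power analysis does exactly that. Here is a small sketch using `statsmodels` (my own illustration; the smallest effect size of interest and the targeted power are assumptions):\n\n```python
# Required sample size per group for an independent two-sample t-test
from statsmodels.stats.power import TTestIndPower

n_per_group = TTestIndPower().solve_power(
    effect_size=0.3,           # smallest standardized effect size we care about
    alpha=0.05,                # significance level
    power=0.8,                 # desired power, i.e. 1 - beta
    alternative='two-sided',
)
print(f'required sample size per group: {n_per_group:.0f}')  # roughly 175 per group
```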
\n\n### False negatives and false positives in comparison \nWhich errors are worse? False positives (type 1) or false negatives (type 2)? Again, it depends. What is the cost of making an error? If we think about the academic scientific process, false positives will be uncovered if replication studies are performed. Unfortunately, that does not happen as much as it should. False negatives could be considered worse though, because they could kill the interest of anyone doing further research in that direction. \n\nHow is it in other fields? In medicine, there are many situations where we would rather want to detect all actual positives at the cost of having some extra false positives. Imagine you perform some quick test to check for a dangerous illness - you'd rather trigger a false alarm that leads to more rigorous but unnecessary follow-up investigations than miss the illness, which could result in the patient's death. In many business contexts it may be the other way around. If you don't implement an actually cool new feature on your website because it didn't pass the significance test (false negative), your customer won't know and won't care. But if you think some new feature is gonna be great but the customers actually hate it (false positive), you'll lose a lot of money. Think of recommendation systems: If you fail to show some of the interesting products, no one will notice; but if you show a lot of things people are not interested in, they won't be as engaged with your site.\n\nAs always, it depends. That is not an overly satisfying answer, I know. In practice, one will have to find a satisfactory trade-off between what is desirable and what is feasible. From a psychological point of view (don't quote me, I'm not a psychologist), I think humans are more prone to make false positive errors because we want to find interesting results. There are countless examples in the scientific literature that show that people pull a lot of stunts, being very flexible in their data analysis, introducing new covariates, and sometimes even performing p-hacking, to torture the p-value below 0.05. I want to point, again, to the paper I already mentioned in the introduction, [*Why Most Published Research Findings Are False*](https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.0020124). I see no reason why this should be any different for experimentation in a business/industry context; it just doesn't usually become public. Hence, I think rigorous false positive control is very important, and that's why I devoted most of this whole discussion here to this topic.\n\n### Confirmatory analysis and pre-registration \n \nAnother phenomenon that can inflate false positive rates is an insufficient separation of **exploratory and confirmatory analysis**. This often leads to hypothesizing after exploring the data. People see something in the data that looks interesting, build a hypothesis on it, and then turn the story around and make it look like that was the hypothesis they had in the first place and their data now confirms it. Often, such apparently significant insights from exploration will fail to replicate in a confirmation study. Just don't use the same data for exploration and then for confirmation of your exploratory findings; it doesn't make sense.\n\nI think this idea is comparable to a common problem in machine learning: overfitting to the training data and then being surprised that the model generalizes poorly to new data. That's why we keep a hold-out data set in machine learning problems. We split it off the training data before we even start exploratory data analysis to make sure we can test the model's generalization performance on data which neither we humans nor the algorithm have seen before. I think the same idea applies to statistical testing. If we build and test hypotheses on the same data, it may be like overfitting on training data. Of course data exploration is necessary, but we should come up with a well-defined hypothesis and test design in the exploratory phase and then take unused data to conduct confirmatory analysis and tests. This way, we can be (a little more) sure that we don't fool ourselves.\n\nIf you want to confirm a theory or hypothesis without fooling yourself (too much), a great idea is to **pre-register** your methodology (e.g. with a scientific journal if you work in academia or with the experimentation team at your company). Pre-registering the test design after exploration and then actually sticking with it during confirmation may seem a little bit inflexible, but it formalizes your false positive rate. 
And hey, that's what this whole work here is about, right - **error control**.\n\n## Conclusion \nIn this project I discussed some essential aspects of error control in statistical testing. After an [introduction](#testing) to the topic, I explained the meaning of [p-values and significance levels](#pvalue), as well as the different [types of errors](#error) and power in null hypothesis significance testing. Further, I discussed how false positive error rates can be inflated in situations where [multiple tests](#multiple) or sequential analysis with [optional stopping](#stop) are performed, and how one can apply different correction methods to control the false positive rate in these situations. Finally, I highlighted a few strategies for reducing the false negative error rate, compared the impact of both false positives and false negatives in different situations, and argued for a separation of exploratory and confirmatory analysis.\n\nThat was a lot of information. This whole project got a lot longer and more detailed than I anticipated and I learned a lot through researching the many error control methods. If you made it all the way down here, I hope you learned something as well and found it as exciting as I did. Thanks a lot for your interest and following along. If you are generally interested in the application of statistical models on real data, you may like my project on the [European Social Survey](https://github.com/Pascal-Bliem/european-social-survey), in which I go through data selection and cleaning, interactive exploration, statistical testing, and survey answer prediction with different machine learning models. \n\nI wish you happy testing with low error rates!\n")},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Insights from the European Social Survey 8","An end-to-end data science project exploring social trends, attitudes, and behaviours throughout Europe",new Date("2019-10-08"),"https://www.sciencespo.fr/sites/default/files/shutterstock_309509678_6.jpg","Let's explore social trends, attitudes, and behaviours throughout Europe!",["Data Science & AI/ML"],'In this full-stack data science project, we will analyze the 8th European Social Survey. In case you want to explore this project more interactively, you can find the Jupyter Notebook and the data on my [Github](https://github.com/Pascal-Bliem/european-social-survey). After a contextual [introduction](#introduction), we will start with [data selection and cleaning](#data), followed by interactive [visualization and exploration](#viz). In the second half of the project, we will gain deeper insights by employing [statistical inference](#stats) and [machine learning](#ml) methods.\n\n### Introduction \nEurope is a vast and diverse continent and its population is a fascinating subject for sociological studies. The [European Social Survey](https://www.europeansocialsurvey.org/) provides great data sets for exactly that purpose. According to their own description, "The European Social Survey (ESS) is an academically driven cross-national survey that has been conducted across Europe since its establishment in 2001. Every two years, face-to-face interviews are conducted with newly selected, cross-sectional samples. The survey measures the attitudes, beliefs and behaviour patterns of diverse populations in more than thirty nations.". This data set contains the results of the 8th round of the ESS surveyed in 2016/17.\n\nWe may wonder, how do the citizens of European countries think about certain social issues? 
What about climate change, politics, immigration, individual well-being? How do people\'s opinions and beliefs differ within a country or among the countries? I\'m personally interested in these questions because I have been roaming around Asia for a long time but soon I may return to Europe. I\'ve been quite disconnected from my home country but I heard the horror stories about anti-democratic political parties on the rise, anti-immigration tendencies, people denying climate change, and so on. One can easily get the impression from the media that many people are unhappy or dissatisfied about a lot of things. That\'s scary enough to make me want to check some actual numbers instead of listening to all the flashy headlines. Follow along while I dig into this data set to see what I should expect when I get back home to Europe soon. Oh, one more thing before we start: for whatever reason Israel took part in this survey; last time I checked Israel was not in Europe, but hey, welcome on board friends.\n\n### Table of contents\n1. [Introduction](#introduction)\n2. [Data selection and cleaning](#data)\n3. [Exploration and visualization](#viz)\n * [Creating the visualization](#viz1)\n * [Exploring trends](#viz2) \n4. [Hypothesis testing and effect sizes](#stats)\n * [Theoretical background](#stats1)\n * [Performing tests](#stats2)\n5. [Predicting answers with machine learning](#ml)\n * [Data preparation](#ml1)\n * [Interpretation of linear models](#ml2)\n * [Interpretation of gradient-boosted ensembles](#ml3)\n6. [Conclusion](#conclusion)\n\n### Data selection and cleaning \nLoad the libraries we\'ll need and have a look at the data files.\n\n\n```python\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport scipy.stats # statistical functions\n# plotting\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set_style("darkgrid")\n%matplotlib notebook\n\n# avoid displaying warnings in notebook (still visible in console)\nimport warnings\nwarnings.filterwarnings(\'ignore\')\n```\n\n\n```python\n# start with loading all available data into a data frame\ndf = pd.read_csv("./ESS8e02.1_F1.csv")\nprint("That\'s all the data:")\ndf.info()\n```\n\n```python\nThat\'s all the data:\n<class \'pandas.core.frame.DataFrame\'>\nRangeIndex: 44387 entries, 0 to 44386\nColumns: 534 entries, name to pweight\ndtypes: float64(226), int64(298), object(10)\nmemory usage: 180.8+ MB\n``` \n\nWe can see that we are dealing with a fairly large data set. The 534 columns each represent one variable corresponding to a question asked in the survey. We have 44387 rows, each corresponding to one surveyed person. \n\nWhen dealing with a lot of numbers, it can be too easy to forget what is actually behind them - so at this point we should remind ourselves that each of these rows corresponds to a real human being who took the time for this survey. A human who answered all these questions, some of which were probably hard to reflect on. And in some of these rows, we will probably be able to read some really sad stories and personal tragedies. 
We should be grateful towards these people and appreciate their contribution to our scientific understanding, and we should use the knowledge we gain from this data to contribute to the well-being of all people.\n\nNow, regarding the **data selection and cleaning**, there is a lot to do before we can use the data properly, so let\'s go through some important points:\n- The variables are grouped into topics such as media and social trust, politics, subjective well-being, social issues, climate change etc. which contain important information for us, but also administrative variables (such as interview dates) and interview code variables which I will not consider here. There are also variables for specific countries (e.g. about specific political parties) which I will not consider either.\n- There are sampling stratification weights given for each person and population weights for each country, which should usually be applied when calculating average aggregated values. I will not use them here though, because I am also interested in the whole distribution of the data, the spread of answers given, and multiplying these weights would greatly distort individuals\' responses.\n- The ordinal answers to questions are not all on the same scale (e.g. 1 to 5 vs. 0 to 10) and some of the scales seem to be in reverse order with respect to how the corresponding question was phrased. Maybe this was done to avoid leading questions in the survey. I will reverse the order of these scales when it makes sense to me. Please refer to the survey documentation for more details.\n- Invalid answers (e.g. refusal, don\'t know, missing) are sometimes encoded with the numbers 66, 77, 99 for the larger scales, sometimes with 6, 7, 8, 9 for the smaller scales, and sometimes it\'s mixed. Generally, there are a lot of different ways in which the question encoding was designed. This may be the result of many different groups of people working on this survey.\n\nDue to these many quite specific tasks, I manually went through the documentation, retrieved (hopefully) all the information needed for the cleaning, and saved it in a file named `variables.csv`. Let\'s load this file first.\n\n\n```python\nall_variables = pd.read_csv("./variables.csv")\nall_variables.sample(10)\n```\n\n
|     | Name     | Label                                              | Country_specific | Scale_type | Type     | Format      | Valid | Invalid | Question                                           | Group                                             |
|-----|----------|----------------------------------------------------|------------------|------------|----------|-------------|-------|---------|----------------------------------------------------|---------------------------------------------------|
| 144 | rlgatnd  | How often attend religious services apart from... | no               | ordinal    | discrete | numeric-2.0 | 44038 | 349     | C16 Apart from special occasions such as weddi... | Group Subjective well-being, social exclusion,... |
| 405 | edlvpdse | Partner\'s highest level of education, Sweden      | yes              | nominal    | discrete | numeric-4.0 | 1001  | 43386   | F44SE What is the highest level of education y... | Group Socio-demographics                          |
| 364 | nacer2   | Industry, NACE rev.2                               | no               | nominal    | discrete | numeric-3.0 | 39945 | 4442    | F31 What does/did the firm/organisation you wo... | Group Socio-demographics                          |
| 417 | dngnapp  | Partner doing last 7 days: not applicable          | no               | binary     | discrete | numeric-1.0 | 44387 | 0       | F45a Which of the descriptions on this card ap... | Group Socio-demographics                          |
| 232 | ub50unp  | Someone in their 50s, unemployment benefit if ...  | no               | ordinal    | discrete | numeric-1.0 | 10002 | 34385   | ub50unp?                                           | Group Welfare attitudes                           |
| 278 | rshipa4  | Fourth person in household: relationship to re...  | no               | nominal    | discrete | numeric-2.0 | 11257 | 33130   | F44 Looking at this card, what relationship is... | Group Gender, Year of birth and Household grid    |
| 121 | rlgdnis  | Religion or denomination belonging to at prese...  | yes              | nominal    | discrete | numeric-4.0 | 413   | 43974   | C12IS Which one? (Iceland)                         | Group Subjective well-being, social exclusion,... |
| 128 | rlgdnme  | Religion or denomination belonging to in the past  | no               | nominal    | discrete | numeric-2.0 | 4494  | 39893   | C14 Which one?                                     | Group Subjective well-being, social exclusion,... |
| 145 | pray     | How often pray apart from at religious services    | no               | ordinal    | discrete | numeric-2.0 | 43411 | 976     | C17 Apart from when you are at religious servi... | Group Subjective well-being, social exclusion,... |
| 370 | uemp5yr  | Any period of unemployment and work seeking wi...  | no               | binary     | discrete | numeric-1.0 | 12406 | 31981   | F38 Have any of these periods been within the ...  | Group Socio-demographics                          |
\n\nThere we got the variables with their names and a short version of the question as a label. We can see if they are continuous or discrete, ordinal or nominal or binary. I am interested in seeing trends so I want to focus on the questions with an ordinal scale (something like "On a scale of 0 to 10, how much do you agree with..."). The binary yes-or-no questions are not so useful for my purpose so I won\'t consider them. I will also not consider the nominal answers (e.g. occupation, educational degree, political party voted for, country of birth, marital status etc.) with a few exceptions: I want to see differences among the countries (`cntry`) and among genders (`gndr`, only 9 people in the whole survey did not categorize themselves as either male or female so we will consider only these two genders here). I will also include the continuous variables age (`agea`), years of full-time education completed (`eduyrs`), daily news consumption in minutes (`nwspol`), and daily internet usage in minutes (`netustm`).\n\n\n```python\n# get the variables that are not country specific, are ordinal, and are not part of the administrative group\nordinal = all_variables.query("Country_specific == \\"no\\" & Scale_type == \\"ordinal\\" and Group != \\"Group Administrative variables\\"")\n\n# get the continuous variables mentioned above \ncontinuous = all_variables.query( "Name in [\\"agea\\",\\"eduyrs\\",\\"nwspol\\",\\"netustm\\"]")\n\n# get the nominal variables mentioned above \nnominal = all_variables.query( "Name in [\\"cntry\\",\\"gndr\\"]")\n\n# append them to one data frame\nvariables = pd.concat([nominal,continuous,ordinal]).reset_index(drop=True)\nvariables.head(10)\n```\n\n
|   | Name     | Label                                              | Country_specific | Scale_type | Type       | Format      | Valid | Invalid | Question                                           | Group                                          |
|---|----------|----------------------------------------------------|------------------|------------|------------|-------------|-------|---------|----------------------------------------------------|------------------------------------------------|
| 0 | cntry    | Country                                            | no               | nominal    | discrete   | character-2 | 44387 | 0       | 5 Country                                          | Group Country                                  |
| 1 | gndr     | Gender                                             | no               | nominal    | discrete   | numeric-1.0 | 44378 | 9       | F21 CODE SEX, respondent                           | Group Gender, Year of birth and Household grid |
| 2 | nwspol   | News about politics and current affairs, watch...  | no               | continuous | continuous | numeric-4.0 | 43863 | 524     | A1 On a typical day, about how much time do yo...  | Group Media and social trust                   |
| 3 | netustm  | Internet use, how much time on typical day, in...  | no               | continuous | continuous | numeric-4.0 | 30113 | 14274   | A3 On a typical day, about how much time do yo...  | Group Media and social trust                   |
| 4 | agea     | Age of respondent, calculated                      | no               | continuous | continuous | numeric-4.0 | 44232 | 155     | F31b Age of respondent, calculated                 | Group Gender, Year of birth and Household grid |
| 5 | eduyrs   | Years of full-time education completed             | no               | continuous | continuous | numeric-2.0 | 43963 | 424     | F16 About how many years of education have you...  | Group Socio-demographics                       |
| 6 | netusoft | Internet use, how often                            | no               | ordinal    | discrete   | numeric-1.0 | 44338 | 49      | A2 People can use the internet on different de...  | Group Media and social trust                   |
| 7 | ppltrst  | Most people can be trusted or you can\'t be too... | no               | ordinal    | discrete   | numeric-2.0 | 44272 | 115     | A4 Using this card, generally speaking, would ...  | Group Media and social trust                   |
| 8 | pplfair  | Most people try to take advantage of you, or t...  | no               | ordinal    | discrete   | numeric-2.0 | 44072 | 315     | A5 Using this card, do you think that most peo...  | Group Media and social trust                   |
| 9 | pplhlp   | Most of the time people helpful or mostly look...  | no               | ordinal    | discrete   | numeric-2.0 | 44211 | 176     | A6 Would you say that most of the time people ...  | Group Media and social trust                   |
\n\nNow we\'ve got all the variables we need. The `Format` column contains values such as `numeric-1.0`, `numeric-2.0`, and `numeric-4.0`, which tell us how many digits the variable\'s value can have at most. This will be useful for removing invalid answers as they should be encoded as values >= 6, 66, or 6666, respectively. We should be fine with removing everything that is above 5 for the small ordinal scales and above 10 for the large ordinal scales. An exception is the group "Human Values", whose questions include 6 as a valid answer on a 1 to 6 scale and start their invalid answers at >= 7. You can see from this example that real-world data can be really messy and that it is important to pay attention to the details.\n\n\n```python\n# get the variable names grouped by the way they have encoded invalid answers\nhumval = variables.query("Group == \\"Group Human values\\"").Name\nnum1 = variables.query("Format == \\"numeric-1.0\\" & Group != \\"Group Human values\\" ").Name\nnum2 = variables.query("Format == \\"numeric-2.0\\" & Name != \\"eduyrs\\" ").Name\nedy = ["eduyrs"]\nnum4 = variables.query("Format == \\"numeric-4.0\\" ").Name\n```\n\nLet\'s now select the variables we chose from the whole data set.\n\n\n```python\n# drop the 9 people who are neither male nor female\ndf = df[df.gndr<=2]\n\n# select all the variables that we chose above\ness = df[variables.Name]\ness.sample(10)\n```\n\n
*(output: a random sample of 10 rows × 156 columns of the selected variables, from `cntry` and `gndr` through the human-values items such as `impfun`)*
\n\n```python\n# check for missing values \nprint(f"Missing values: \\n{ess.isna().sum()[ess.isna().sum() > 0]}")\n```\n\n    Missing values: \n    rfgfrpc    1614\n    eusclbf    8931\n    eudcnbf    8931\n    lknemny    2010\n    dtype: int64\n    \n\nLooks great. We already have some missing values present in the variables:\n- `rfgfrpc`: Most refugee applicants not in real fear of persecution own countries\n- `eusclbf`: Against or In favour of European Union-wide social benefit scheme\n- `eudcnbf`: More decisions made by EU: level of benefits in \\[country\\] become higher or lower\n- `lknemny`: How likely not enough money for household necessities next 12 months\n\nWe will now add many more missing values by replacing the invalid response encodings with NaNs. These will later, in visualization or models, either be ignored/dropped or imputed.\n\n\n```python\n# replace invalid responses (discussed above) with NaN\nfor group, cutoff in zip([humval,num1,num2,edy,num4],[7,6,11,66,6666]):\n    for var in group:\n        ess.loc[:,var].where(ess[var] < cutoff, np.nan, inplace=True)\n\n# check for missing values again\nprint(f"Missing values after removing invalid answers: \\n{ess.isna().sum()[ess.isna().sum() > 0]}") \n```\n\n    Missing values after removing invalid answers:\n    nwspol       524\n    netustm    14272\n    eduyrs       421\n    netusoft      49\n    ppltrst      115\n    pplfair      315\n    pplhlp       176\n    polintr       97\n    psppsgva     956\n    actrolga     933\n    psppipla     840\n    cptppola     997\n    trstprl      873\n    trstlgl      849\n    trstplc      320\n    trstplt      645\n    trstprt      855\n    trstep      3459\n    trstun      3457\n    prtdgcl    25315\n    lrscale     5803\n    stflife      186\n    stfeco       885\n    stfgov      1160\n    stfdem      1488\n    stfedu      1568\n    stfhlth      316\n    gincdif      670\n    mnrgtjb      498\n    freehms     1201\n    ... \n    lkuemp     10284\n    lknemny     3773\n    hhmmb        130\n    domicil       47\n    estsz       4940\n    wkdcorga    3881\n    iorgact     4086\n    hinctnta    7938\n    hincfel      519\n    ipcrtiv      841\n    imprich      787\n    ipeqopt      814\n    ipshabt      890\n    impsafe      740\n    impdiff      819\n    ipfrule     1015\n    ipudrst      876\n    ipmodst      852\n    ipgdtim      810\n    impfree      790\n    iphlppl      795\n    ipsuces      922\n    ipstrgv      996\n    ipadvnt      833\n    ipbhprp      870\n    iprspot      996\n    iplylfr      771\n    impenv       753\n    imptrad      748\n    impfun       818\n    Length: 153, dtype: int64\n    \n\nGreat, looks like we have replaced all invalid answers with NaN. Now let\'s reverse the scales that have to be reversed. I have prepared a list with the corresponding variables. If you want to check if you\'d agree with my decision to reverse them, please look them up in the survey\'s documentation. There also is a variable named `lrscale` which represents the placement on the political left-right scale, where 0 means very left and 10 means very right. It doesn\'t have to be reversed since the corresponding question is just "In politics people sometimes talk of \'left\' and \'right\'. \\[...\\] Where would you place yourself on this scale \\[...\\]?". 
I just want to mention it here in case someone wonders.\n\n\n```python\n# list of variables for which the scale seems reversed with \n# respect to how the corresponding question was phrased\nreverse = ["ipcrtiv", "imprich", "ipeqopt", "ipshabt", "impsafe", "impdiff", "ipfrule", \n           "ipudrst", "ipmodst", "ipgdtim", "impfree", "iphlppl", "ipsuces", "ipstrgv", \n           "ipadvnt", "ipbhprp", "iprspot", "iplylfr", "impenv", "imptrad", "impfun", \n           "prtdgcl", "gincdif", "mnrgtjb", "freehms", "hmsfmlsh", "hmsacld", "imsmetn", \n           "imdfetn", "impcntr", "aesfdrk", "health", "hlthhmp", "rlgatnd", "pray", \n           "gvrfgap", "rfgfrpc", "rfgbfml", "elgcoal", "elgngas", "elghydr", "elgnuc", \n           "elgsun", "elgwind", "elgbio", "clmchng", "ccgdbd", "inctxff", "sbsrnen", \n           "banhhap", "dfincac", "smdfslv", "sbstrec", "sbprvpv", "sbeqsoc", "sbbsntx", \n           "sblazy", "sblwcoa", "imsclbn", "uentrjb", "lbenent", "bennent", "eudcnbf",\n           "domicil", "hincfel"]\n\n# reverse the ranges\nfor var in reverse:\n    upp = int(ess[var].max())\n    low = int(ess[var].min())\n    ess[var].replace(dict(zip(range(low,upp+1),range(upp,low-1,-1))),inplace=True)\n```\n\nGreat! So, as long as whatever we do next can handle NaNs, we are done with the wrangling for now!\n\n\n```python\ness.sample(10)\n```\n\n
\nGreat! So, as long as whatever we do next can handle NaNs, we are done with the wrangling for now!\n\n\n```python\ness.sample(10)\n```\n\n| | cntry | gndr | nwspol | netustm | agea | eduyrs | netusoft | ppltrst | pplfair | pplhlp | ... | iphlppl | ipsuces | ipstrgv | ipadvnt | ipbhprp | iprspot | iplylfr | impenv | imptrad | impfun |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| 24132 | IE | 2 | 30.0 | 180.0 | 61 | 16.0 | 5.0 | 6.0 | 6.0 | 7.0 | ... | 6.0 | 3.0 | 6.0 | 3.0 | 4.0 | 5.0 | 6.0 | 6.0 | 4.0 | 3.0 |\n| 12352 | EE | 2 | 90.0 | NaN | 61 | 13.0 | 1.0 | 7.0 | 7.0 | 7.0 | ... | 6.0 | 2.0 | 5.0 | 1.0 | 5.0 | 1.0 | 6.0 | 6.0 | 4.0 | 3.0 |\n| 37277 | PL | 2 | 60.0 | NaN | 75 | 11.0 | 1.0 | 5.0 | 5.0 | 1.0 | ... | 5.0 | 4.0 | 5.0 | 2.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 2.0 |\n| 13840 | ES | 1 | 30.0 | NaN | 54 | 9.0 | 2.0 | 5.0 | 5.0 | 7.0 | ... | 6.0 | 5.0 | 6.0 | 4.0 | 6.0 | 1.0 | 5.0 | 6.0 | 6.0 | 5.0 |\n| 11982 | EE | 1 | 0.0 | 240.0 | 22 | 12.0 | 5.0 | 2.0 | 3.0 | 5.0 | ... | 3.0 | 2.0 | 2.0 | 3.0 | 4.0 | 2.0 | 4.0 | 3.0 | 2.0 | 2.0 |\n| 13187 | ES | 2 | 60.0 | NaN | 92 | 7.0 | 1.0 | 8.0 | 8.0 | 4.0 | ... | 6.0 | 4.0 | 6.0 | 3.0 | 6.0 | 5.0 | 5.0 | 4.0 | 6.0 | 2.0 |\n| 23012 | IE | 1 | 90.0 | NaN | 76 | 8.0 | 1.0 | 5.0 | 5.0 | 10.0 | ... | 6.0 | 5.0 | 3.0 | 6.0 | 5.0 | 5.0 | 6.0 | 6.0 | 4.0 | 6.0 |\n| 15216 | FI | 2 | 150.0 | NaN | 80 | 7.0 | 1.0 | 9.0 | 7.0 | 8.0 | ... | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 4.0 | 6.0 | 6.0 | 5.0 | 6.0 |\n| 28764 | IT | 2 | 15.0 | NaN | 85 | 5.0 | 1.0 | 3.0 | 4.0 | 5.0 | ... | 5.0 | 4.0 | 5.0 | 5.0 | 5.0 | 6.0 | 5.0 | 5.0 | 5.0 | 3.0 |\n| 19783 | GB | 1 | 150.0 | 240.0 | 58 | 13.0 | 5.0 | 4.0 | 6.0 | 6.0 | ... | 5.0 | 2.0 | 3.0 | 4.0 | 3.0 | 1.0 | 4.0 | 6.0 | 4.0 | 4.0 |\n\n10 rows \xd7 156 columns\n\n
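\nBefore moving on, here is a quick overview of the cleaned frame (my addition, not part of the original notebook): its dimensions and the variables with the largest share of missing answers.\n\n\n```python\n# quick overview of the cleaned data: shape and the most incomplete variables\nprint(ess.shape)\nprint(ess.isna().mean().sort_values(ascending=False).head())\n```\n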
\n\n\n\n### Exploration and visualization \nNow that we have cleaned the data, we want to find out what story it tells. We want to see how people answered all these questions, how these answers differ by country or gender, and how the answers to some questions correlate with the answers to other questions. Which country is the happiest? Who has the most trust in politics, who feels most responsible for climate change, and do women and men have different opinions on these questions? \n\nThe best way to explore such questions is to visualize them in an interactive way so that we can click around, select different variables, and graphically see how the responses differ between the countries. I\'ve written an `ess_plot()` function below, which creates such an interactive visualization using Python\'s Bokeh library. Feel free to read all the code, but if you find it too boring just click [here](#next) to jump to the next cell. The interactive elements are only going to work in a Jupyter Notebook and not if you read this on a blog post on my website; that\'s no problem though, I\'ll describe a couple of different variable configurations.\n\n### Creating the visualization \n\n\n```python\ndef ess_plot(var1="health",var2="happy"):\n """\n This function creates an interactive graphic visualizing insights from the ESS8.\n It consists of three sub-plots:\n - A line graph that shows the median of one variable as a function of (discrete bins of) another variable.\n This chart can be filtered by country.\n - A bar chart that shows the mean of the first variable for each country\n - A bar chart that shows the mean of the second variable for each country\n Hovering over the graphs will display values in detail.\n \n Parameters\n ----------\n var1, var2 : string\n Names of the two variables to compare initially. 
\n (variables can be changed interactively in the plot)\n\n Returns\n -------\n No returns\n """\n import math\n from bokeh.plotting import figure \n from bokeh.io import output_notebook, show, push_notebook\n output_notebook()\n from bokeh.models import Band\n from bokeh.models import Range1d\n from bokeh.models import FactorRange\n from bokeh.models import HoverTool\n from bokeh.models import ColumnDataSource\n from bokeh.layouts import row\n from ipywidgets import interact\n \n # width and height for subplots\n p_width ,p_height = 320, 420\n \n # X and Y variable values\n x = var1#"health"\n y = var2#"happy"\n # country filter variable and dictionary \n cntry = "All countries"\n cntry_dict = dict(zip(\n ["All countries","Austria","Belgium","Switzerland","Czech republic","Germany","Estonia","Spain",\n "Finland","France","United Kingdom","Hungary","Ireland","Israel","Iceland","Italy","Lithuania",\n "Netherlands","Norway","Poland","Portugal","Russia","Sweden","Slovenia"],\n ["All countries",\'AT\', \'BE\', \'CH\', \'CZ\', \'DE\', \'EE\', \'ES\', \'FI\', \'FR\', \'GB\', \'HU\',\'IE\', \'IL\', \n \'IS\', \'IT\', \'LT\', \'NL\', \'NO\', \'PL\', \'PT\', \'RU\', \'SE\',\'SI\']))\n \n # boolean False for first setup, later True when plot is updated\n setup = False\n # h is a notebook handle which will be inistialized later and used for updating the plot\n global h\n h = None\n \n def calc_median_iqr():\n """Calculates medians and quartiles for plotting"""\n nonlocal x\n nonlocal y\n nonlocal cntry\n \n # get a copy of the variable columns we want to look at\n if cntry == "All countries":\n ess_c = ess[[x,y]].copy()\n else:\n c = cntry_dict[cntry]\n ess_c = ess.query("cntry == @c")[[x,y]].copy()\n \n # get the y-range\n yrange = (ess_c[y].min(),ess_c[y].max())\n \n # jitter the y-values\n ess_c[y] = ess_c[y] + np.random.uniform(-0.5,0.5,len(ess_c[y]))\n \n # remove NaNs from x-values because bokeh apparently has a problem converting them to JSON\n xs = sorted([n for n in ess_c[x].unique() if not math.isnan(n)])\n \n # get x-range\n xrange = (min(xs),max(xs))\n \n # calculate the median, first, and third quartile of the y-values for each x-value\n medians = [ess_c.loc[ess_c[x]==i,y].median() for i in xs]\n Q3 = [ess_c.loc[ess_c[x]==i,y].quantile(0.75) for i in xs]\n Q1 = [ess_c.loc[ess_c[x]==i,y].quantile(0.25) for i in xs]\n \n return yrange, xrange, xs, medians, Q1, Q3\n \n # # # #\n # Set up the different elements of the plot\n # # # #\n \n # # # # Plot 1: Line plot X vs. Y\n \n # calculate median, Q1, Q3 etc.\n yrange, xrange, xs, medians, Q1, Q3 = calc_median_iqr()\n \n # set up the data source\n source1 = ColumnDataSource(dict(x = xs,\n medians = medians,\n Q3 = Q3,\n Q1 = Q1,))\n \n # create figure 1 for X vs. 
Y plot\n p1 = figure(plot_width= p_width, plot_height=p_height,\n title=cntry,y_range=yrange,x_range=xrange,\n tools="hover", tooltips="@x -> @medians")\n \n # line plot that shows the median +/- inter-quartile range of one variable as a function of another variable\n p1.line("x", "medians", color="navy",line_width = 3,source=source1,legend="Median\\n+/- IQR")\n \n # plot the inter-quartile range (IQR) as a band\n band = Band(base=\'x\', lower=\'Q1\', upper=\'Q3\', source=source1, \n level=\'underlay\', fill_color="lightblue",fill_alpha=.8, line_width=1, line_color=\'navy\',)\n p1.add_layout(band)\n p1.legend.location = "bottom_left"\n \n # # # # Plot 2: Bar plot for X \n \n def calc_cntry_X():\n """Calculates variable X for all countries"""\n nonlocal x\n \n # get mean and standard deviation of X for each country, also grouped by gender\n gr = ess.groupby("cntry")[x].agg(["mean","std"])\n mean = gr["mean"]\n std = gr["std"]\n gr = ess.groupby(["gndr","cntry"])[x].agg(["mean","std"])\n mean_f = gr.loc[2,"mean"]\n std_f = gr.loc[2,"std"]\n mean_m = gr.loc[1,"mean"]\n std_m = gr.loc[1,"std"]\n \n # get list of coutries without "all countries"\n cntry_list = list(cntry_dict.keys())[1:]\n \n # sort so that the highest mean value is in front\n zipped = zip(mean, std, mean_f, std_f, mean_m, std_m, cntry_list)\n zipped = sorted(zipped,reverse=True)\n mean, std, mean_f, std_f, mean_m, std_m, cntry_list = zip(*zipped)\n \n return mean, std, mean_f, std_f, mean_m, std_m, cntry_list\n \n # calc means and stds\n xmean, xstd, xmean_f, xstd_f, xmean_m, xstd_m, xcntry_list = calc_cntry_X()\n \n # set up the data source 2\n source2 = ColumnDataSource(dict( cntry = xcntry_list,\n mean = xmean,\n std = xstd,\n std_h = [m+s for m,s in zip(xmean,xstd)],\n std_l = [m-s for m,s in zip(xmean,xstd)],\n mean_f = xmean_f,\n std_f = xstd_f,\n mean_m = xmean_m,\n std_m = xstd_m,))\n \n # second plot: bar chart for all countries \n p2 = figure(plot_width=p_width , plot_height=p_height,\n title=variables.query("Name == @x").Label.values[0],\n x_range = xcntry_list,\n y_range = (0,max(xmean)+max(xstd)+0.5),\n tools="")\n \n # bar plot that shows the variable for each country\n bars2 = p2.vbar(x=\'cntry\', top=\'mean\', width=0.8,source=source2,legend="Mean +/- Std.Dev.",\n fill_color=\'lightblue\', fill_alpha=0.8, line_color="navy", \n hover_fill_color="orange", hover_alpha= 0.8, hover_line_color="darkred")\n \n # add lines to represent the spread given by the standard deviation\n p2.segment(x0=\'cntry\', y0="std_l", x1=\'cntry\', y1="std_h", line_width=2, color="black",source=source2)\n \n # set up the hover tool\n p2.add_tools(HoverTool(tooltips=[("Country", "@cntry"),\n ("All", "@mean{0.2f} +/- @std{0.2f}"),\n ("Female", "@mean_f{0.2f} +/- @std_f{0.2f}"),\n ("Male", "@mean_m{0.2f} +/- @std_m{0.2f}")],\n renderers=[bars2]))\n \n # adjust legend location and rotate country labels\n p2.legend.location = "top_right"\n p2.xaxis.major_label_orientation = math.pi/2\n \n # # # # Plot 3: Bar plot for Y\n \n def calc_cntry_Y():\n """Calculates variable Y for all countries"""\n nonlocal y\n \n # get mean and standard deviation of X for each country, also grouped by gender\n gr = ess.groupby("cntry")[y].agg(["mean","std"])\n mean = gr["mean"]\n std = gr["std"]\n gr = ess.groupby(["gndr","cntry"])[y].agg(["mean","std"])\n mean_f = gr.loc[2,"mean"]\n std_f = gr.loc[2,"std"]\n mean_m = gr.loc[1,"mean"]\n std_m = gr.loc[1,"std"]\n \n # get list of coutries without "all countries"\n cntry_list = list(cntry_dict.keys())[1:]\n \n # 
sort so that the highest mean value is in front\n zipped = zip(mean, std, mean_f, std_f, mean_m, std_m, cntry_list)\n zipped = sorted(zipped,reverse=True)\n mean, std, mean_f, std_f, mean_m, std_m, cntry_list = zip(*zipped)\n \n return mean, std, mean_f, std_f, mean_m, std_m, cntry_list\n \n # calc means and stds\n ymean, ystd, ymean_f, ystd_f, ymean_m, ystd_m, ycntry_list = calc_cntry_Y()\n \n # set up the data source 3\n source3 = ColumnDataSource(dict( cntry = ycntry_list,\n mean = ymean,\n std = ystd,\n std_h = [m+s for m,s in zip(ymean,ystd)],\n std_l = [m-s for m,s in zip(ymean,ystd)],\n mean_f = ymean_f,\n std_f = ystd_f,\n mean_m = ymean_m,\n std_m = ystd_m,))\n \n # third plot: bar chart for all countries \n p3 = figure(plot_width=p_width , plot_height=p_height,\n title=variables.query("Name == @y").Label.values[0],\n x_range = ycntry_list,\n y_range = (0,max(ymean)+max(ystd)+0.5),\n tools="")\n \n # bar plot that shows the variable for each country\n bars3 = p3.vbar(x=\'cntry\', top=\'mean\', width=0.8,source=source3,legend="Mean +/- Std.Dev.",\n fill_color=\'lightblue\', fill_alpha=0.8, line_color="navy", \n hover_fill_color="orange", hover_alpha= 0.8, hover_line_color="darkred")\n \n # add lines to represent the spread given by the standard deviation\n p3.segment(x0=\'cntry\', y0="std_l", x1=\'cntry\', y1="std_h", line_width=2, color="black",source=source3)\n \n # set up the hover tool\n p3.add_tools(HoverTool(tooltips=[("Country", "@cntry"),\n ("All", "@mean{0.2f} +/- @std{0.2f}"),\n ("Female", "@mean_f{0.2f} +/- @std_f{0.2f}"),\n ("Male", "@mean_m{0.2f} +/- @std_m{0.2f}")],\n renderers=[bars3]))\n \n # adjust legend location and rotate country labels\n p3.legend.location = "top_right"\n p3.xaxis.major_label_orientation = math.pi/2\n \n def plot_styling(plots):\n """Styles the plot in seaborn-like style"""\n \n # various commands for styling the plot, I\'m trying to give it the "seaborn" look which I like a lot\n for p in plots:\n p.background_fill_color="lightgray"\n p.background_fill_alpha=0.8\n p.axis.axis_line_color ="white"\n p.axis.minor_tick_line_color ="white"\n p.axis.major_tick_line_color ="white"\n p.legend.background_fill_color = "lightgray"\n p.legend.background_fill_alpha = 0.6\n p.legend.border_line_color="navy"\n p.grid.grid_line_color = "white"\n p.grid.grid_line_alpha = 0.8\n p.axis.major_label_text_font_size = "10pt"\n p.toolbar_location = None\n p.min_border_right = 10\n \n # update functions for dropdown variable selecters\n def updateX(VariableX): \n nonlocal x \n nonlocal y \n nonlocal setup\n new = variables.query("Label == @VariableX").Name.values[0]\n if new != y:\n x = new\n if setup:\n update_plot()\n \n def updateY(VariableY): \n nonlocal x \n nonlocal y \n nonlocal setup\n new = variables.query("Label == @VariableY").Name.values[0]\n if new != x:\n y = new\n if setup:\n update_plot()\n \n def updateCntry(Country): \n nonlocal cntry\n nonlocal setup\n cntry = Country\n if setup:\n update_plot()\n \n # the main updating function \n def update_plot():\n """The main function that creates and updates the plot elements"""\n nonlocal x\n nonlocal y\n nonlocal cntry\n \n # # # # # Updates for Plot 1\n \n # calculate median, Q1, Q3 etc.\n yrange, xrange, xs, medians, Q1, Q3 = calc_median_iqr()\n \n # update the data source\n source1.data = dict(x = xs,\n medians = medians,\n Q3 = Q3,\n Q1 = Q1,)\n \n # update axis names and ranges\n p1.xaxis.axis_label = variables.query("Name == @x").Label.values[0]\n p1.yaxis.axis_label = variables.query("Name == 
@y").Label.values[0]\n p1.x_range.start = min(xs)\n p1.x_range.end = max(xs)\n p1.y_range.start = min(yrange)\n p1.y_range.end = max(yrange)\n p1.title.text = cntry\n \n # # # # # Updates for Plot 2\n \n # calc updated means and stds\n xmean, xstd, xmean_f, xstd_f, xmean_m, xstd_m, xcntry_list = calc_cntry_X()\n \n # update the data source 2\n source2.data = dict(cntry = xcntry_list,\n mean = xmean,\n std = xstd,\n std_h = [m+s for m,s in zip(xmean,xstd)],\n std_l = [m-s for m,s in zip(xmean,xstd)],\n mean_f = xmean_f,\n std_f = xstd_f,\n mean_m = xmean_m,\n std_m = xstd_m,)\n \n # update range and title\n p2.x_range.factors = xcntry_list\n p2.y_range.end = max(xmean)+max(xstd)+0.5\n p2.title.text = variables.query("Name == @x").Label.values[0]\n \n # # # # # Updates for Plot 3\n \n # calc updated means and stds\n ymean, ystd, ymean_f, ystd_f, ymean_m, ystd_m, ycntry_list = calc_cntry_Y()\n \n # update the data source 2\n source3.data = dict(cntry = ycntry_list,\n mean = ymean,\n std = ystd,\n std_h = [m+s for m,s in zip(ymean,ystd)],\n std_l = [m-s for m,s in zip(ymean,ystd)],\n mean_f = ymean_f,\n std_f = ystd_f,\n mean_m = ymean_m,\n std_m = ystd_m,)\n \n # update range and title\n p3.x_range.factors = ycntry_list\n p3.y_range.end = max(ymean)+max(ystd)+0.5\n p3.title.text = variables.query("Name == @y").Label.values[0]\n \n # style the plots\n plot_styling([p1,p2,p3])\n \n # if not first setup, update plot with push_notebook\n global h\n if setup:\n push_notebook(handle=h)\n \n \n # set up the interactive dropdown variable and country selecter\n x_default = x\n y_default = y\n x_first = variables.query("Name == @x_default ").Label.values.tolist()\n y_first = variables.query("Name == @y_default ").Label.values.tolist()\n var_x = interact(updateX,VariableX=x_first+list(ordinal.Label.values))\n var_y = interact(updateY,VariableY=y_first+list(ordinal.Label.values))\n var_cntry = interact(updateCntry,Country=cntry_dict.keys())\n \n # build the plot\n update_plot()\n h = show(row(p1,p2,p3),notebook_handle=True)\n setup = True\n\n\n```\n\n### Exploring trends \n Before we dive into the plot\'s message, let me describe its composition. There are the following three sub-plots:\n\n- The first sub-plot is a line graph that shows how one variable behaves as a function of another variable (e.g. how healthy are people vs. how happy are people). The variable on the X-axis stays in its original binned (e.g. integer scale 0 to 10) form. For the variable on the Y-axis, the values are randomly jittered a little bit and the median, first quartile (Q1), and third quartile (Q3), corresponding to each X-value, are calculated. An example: For all the people who answered X with a certain value, take their answers to Y, jitter them a little bit to create a more continuous scale, calculate median, Q1, and Q3. I decided to take the median as a central tendency and the interquartile range (IQR) (instead of mean and standard deviation) because some of the variables\' distributions are quite skewed (meaning they have longer tails on one side), and this can be nicely seen if Q1\'s and Q3\'s absolute distance from the median is unequal. The dropdown menus allow to select which variables are displayed and to filter the sub-plot by country.\n- The second and third sub-plots are bar charts that show the mean values (and the standard deviation as lines around the bar tops) for variables X and Y, respectively, for each country. 
The countries are sorted according to the mean value of the variable, the highest being on the left. Note that this means that the X-axes of these two plots are generally not the same. You can use your mouse to hover over the bars and display the exact values for mean and standard deviation, also given separately for female and male respondents.\n\nNote that since the answers in the survey were not encoded on identical scales, **the sub-plots do not necessarily have the same Y-axis range**. Another technical note: Once you create the plot a second time, you cannot update the first one anymore because the new one took over the notebook handle which Bokeh uses to update figures in notebooks.\n\n\n\n```python\n# make our first plot\ness_plot()\n```\n\n![](https://raw.githubusercontent.com/Pascal-Bliem/european-social-survey/master/figures/viz1.png)\n\nAs we can see in the plots above, people who feel healthy are, unsurprisingly, also happier. Try to filter the left plot by country! You can see that e.g. in Hungary very healthy people are a lot happier (almost 4 points) than very unhealthy people. In contrast, in Norway this difference is much smaller (only 2 points). Maybe that can give us a hint that Norway\'s healthcare is quite excellent. If you have a look at the plots on the right, you can see that Israel (very European) feels the healthiest and Switzerland, probably due to chocolate, feels the happiest. If you hover your mouse over the happiness bars you can see that, on average, the women in Finland are a bit happier than the men and that in Italy, it\'s the other way around. \n\nWhen talking about such small differences in means, we should keep in mind that the spread of answers is quite large in comparison and that we did not apply sample stratification weights in these calculations. We will dedicate another section to statistical significance and effect sizes later on. For now, let\'s continue exploring.\n\nLet\'s have a look at how the Europeans think about immigration and sexual orientation based on their answers to the questions "Do immigrants make the country a worse or better place to live?" and "Are you ashamed if a close family member is gay or lesbian?". In the plot below we can see that people who are less comfortable with homosexuality are also less comfortable with immigrants (e.g. Russia, Hungary) and vice-versa (e.g. Iceland, Sweden).\n\n\n```python\n# make another plot\ness_plot("hmsfmlsh","imwbcnt")\n```\n\n![](https://raw.githubusercontent.com/Pascal-Bliem/european-social-survey/master/figures/viz2.png)\n\nWhat about trust in political international cooperation? Let\'s have a look at the answers to the questions "Do you have trust in the European Parliament?" and "Do you have trust in the United Nations?". In the plots below we can see that people who have a high trust in the European Parliament generally also have a high trust in the United Nations. Some countries (e.g. Iceland, Finland) seem to have higher trust in these institutions than other countries (e.g. Israel, Russia). I\'m happy to see that Norway is among the countries which have a high trust in the EU Parliament even though they\'re not a member of the EU.
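\n\nBefore making that plot, here is a quick numerical complement to the visual impression (my addition, not part of the original notebook): the plain Pearson correlation between the two trust variables.\n\n\n```python\n# correlation between trust in the European Parliament and trust in the UN\nprint(ess[["trstep", "trstun"]].corr())\n```\n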
\n\n\n```python\n# yet another plot\ness_plot("trstep","trstun")\n```\n\n![](https://raw.githubusercontent.com/Pascal-Bliem/european-social-survey/master/figures/viz3.png)\n\n\nLet\'s look at a last example of how people feel about their participation in politics and their responsibility for climate change based on the questions "Are you confident in your own ability to participate in politics?" and "To what extent do you feel personal responsibility to reduce climate change?". There is a small trend of people who are more confident in their political abilities also feeling more personal responsibility to reduce climate change. Some countries (e.g. Germany, Switzerland) are among the leaders in both categories.\n\n\n```python\n# one more plot\ness_plot("cptppola","ccrdprs")\n```\n\n![](https://raw.githubusercontent.com/Pascal-Bliem/european-social-survey/master/figures/viz4.png)\n\n\nThose were a few interesting insights from the ESS8 which I picked as examples. Please go ahead and play around with the plots to see what interesting insights you can uncover. In the next section we will consider the statistical significance and effect sizes of what we saw here.\n\n### Hypothesis testing and effect sizes \nIn the exploration above we have seen some interesting-looking effects and some apparent differences between countries. Let\'s for example compare the two European countries I\'ve lived in: Germany and Sweden. It looks like they are similarly happy but have different levels of trust in the UN, different opinions on immigration and on personal responsibility to fight climate change.\n\nBut how do we know that these differences are actually meaningful and did not just appear by random chance? And even if we can be confident that an observed difference is probably not just due to random fluctuation, is its magnitude large enough to be meaningful at all? To answer these questions we\'ll dive into the realm of statistical hypothesis testing and the effect sizes we can deduce from such tests.\n\nThe next couple of cells contain a somewhat lengthy theory recap on parametric and non-parametric null hypothesis significance testing, error control, and effect sizes. If you already know all about it, [click here](#tests) to skip it and see the actual tests. If you don\'t care about statistical tests at all, [click here](#ml) to jump straight to the machine learning part of the project.\n\n### Theoretical background \nBefore we start, we have to note that I already **committed a big sin**. I want to test my hypotheses on the same data which I used to generate them! I built the visualization first before planning the rest of the project, that\'s why. But this means that I\'m not strictly separating my exploratory research from my confirmatory research, which should always be done if possible. Exploring a data set with somewhat flexible methodology and a lot of comparisons will inflate my false positive rate (type 1 error). Often, apparently significant insights from exploration will [fail to replicate](https://www.theguardian.com/science/2012/mar/15/precognition-studies-curse-failed-replications) in a confirmation study. I think this idea is comparable to a common problem in machine learning: overfitting to the training data and then being surprised that the model generalizes poorly to new data. That\'s why we keep a holdout set (which I\'ll get to later again). In Richard Feynman\'s words: "The first principle is that you must not fool yourself, and you are the easiest person to fool." If you want to confirm a theory without fooling yourself (too much), a great idea is to pre-register your methodology (e.g. with a scientific journal if you work in academia or with the experimentation team at your company). In the present case, however, our sample size of about 2000 respondents per country is so large that this is hopefully not much of a problem here.
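\n\nTo make the inflation of the false positive rate more concrete, here is a small simulation (my addition, not part of the original notebook): we run many tests on pure noise, where the null hypothesis is true by construction, and still a handful of comparisons come out "significant" at $\\alpha$ = 0.05.\n\n\n```python\nimport numpy as np\nfrom scipy.stats import ttest_ind\n\nrng = np.random.RandomState(42)\nn_tests = 1000\nfalse_positives = 0\nfor _ in range(n_tests):\n    # two samples from the very same distribution, so any "effect" is pure chance\n    a = rng.normal(size=100)\n    b = rng.normal(size=100)\n    if ttest_ind(a, b).pvalue < 0.05:\n        false_positives += 1\n\n# with alpha = 0.05 we expect roughly 5% false positives in the long run\nprint(f"{false_positives} out of {n_tests} tests came out \'significant\' by chance")\n```\n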
\n\nOkay, now back to testing. In this example, we want to compare two samples and see if they are different from each other (alternative hypothesis) or if they might come from the same population and the observed difference might be due to random fluctuations (null hypothesis). If we talk about probabilities in this (frequentist) context, we consider probabilities as long-term frequencies of the data we observe, assuming a hypothesis is true. If we were to repeat our sampling process an infinite number of times, the frequency with which the observed data would appear would correspond to its probability under the given hypothesis. In null hypothesis significance testing, depending on the situation, there are a lot of different test statistics that can be calculated. Based on such a statistic one can obtain the probability of observing data at least as extreme as the data at hand, assuming the null hypothesis is true. This probability is called the p-value. It is not a magical number that tells you how likely you think it is that any of the hypotheses is true or false (for that you may want to use Bayesian concepts); it simply is the probability of observing data this extreme, assuming the null is true. If the p-value is very small, the data would be very surprising if the null were true. \n\nHow surprising does it have to be for us to be willing to reject the null? That\'s up to us. We (completely subjectively) choose a significance level, $\\alpha$ (often chosen to be 0.05). If p falls under $\\alpha$ we decide to reject the null. Again, $\\alpha$ is not a magical number that tells us how true or false a hypothesis is. For any given test, a hypothesis is either true or not (probability 1 or 0). Instead, $\\alpha$ is what we choose as the highest possible false positive (type 1 error) rate we are willing to accept. Given e.g. $\\alpha$=5% that means, in the long run, if we were to repeat the experiment an infinite number of times, we would incorrectly reject the null (make a type 1 error) at most 5% of the time. With $\\alpha$=5%, in the long run, 1 out of every 20 test results might be a false positive. Of course, we can also find false negatives (type 2 error rate, $\\beta$, not finding a significant effect when it\'s actually there). The quantity 1-$\\beta$ is known as statistical power and it represents the probability of finding a significant effect when it is really there. It depends on $\\alpha$, the sample size, and the effect size. If you want to get a more intuitive understanding of error rates, check out [this amazing animation](https://rpsychologist.com/d3/NHST/).\n\nThere is one more aspect we have to consider for properly controlling the false positive rate. As discussed above, the more tests we perform, the higher the chance that one of them will be a false positive (finding a significant result just by chance). 
That means, if we perform multiple comparisons, we have to lower each test\'s significance level to keep the overall $\\alpha$ at the desired level. The most straightforward approach is probably the [Bonferroni correction](https://en.wikipedia.org/wiki/Bonferroni_correction) which just divides $\\alpha$ by the number of tests for each test. However, it assumes that all of the hypothesis tests are statistically independent which is most likely not the case. That means it will be overly conservative and reduce the tests\' power more than necessary. I think being conservative is not much of a problem here due to the large sample size (and hence, inherently high power), but we could also have chosen to control the false positive rate via the [false discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate).\n\nNow we should figure out which test statistic to use. We want to compare if, and to what extent, two groups (in our example the respondents from Germany and Sweden) are different from each other or if they are, statistically speaking, from the same underlying population. A very common choice would be a two-independent-sample [Student\'s t-test](https://en.wikipedia.org/wiki/Student%27s_t-test) which tests if the means of two groups are significantly different. One assumption of this test is that the two groups have equal variance (homoscedasticity). This could for example be tested by [Levene\'s test](https://en.wikipedia.org/wiki/Levene%27s_test) and in case the assumption is violated, one can perform [Welch\'s](https://en.wikipedia.org/wiki/Welch%27s_t-test) version of the t-test which accounts for unequal variances. Another important assumption is that the sample mean of a group should be normally distributed around the mean (and scaled by the variance) of the underlying population. Let\'s have a look at our data of interest first.\n\n\n```python\n# plot the data we want to perform tests on\nfig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1,ncols=4,figsize=(16,5))\nfor var, ax, title in zip(["happy","trstun","imwbcnt","ccrdprs"],[ax1, ax2, ax3, ax4],\n ["Happiness","Trust in UN","Is immigration good?","Responsibility climate change"]):\n # get data\n de = ess.loc[ess.cntry=="DE",var]\n se = ess.loc[ess.cntry=="SE",var]\n \n # make plots\n sns.distplot(de,bins=11,kde=True,hist_kws={"alpha":0.3}, kde_kws={"bw":0.5},ax=ax,label="Germany")#color="lightblue"\n sns.distplot(se,bins=11,kde=True,hist_kws={"alpha":0.3}, kde_kws={"bw":0.5},ax=ax,label="Sweden")\n ax.set_xlabel("")\n ax.legend()\n ax.set_title(title)\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/insights-from-the-european-social-survey-8/output_37_0.png)\n\n\nWe can see that our raw data from our samples is definitely not distributed normally in most cases. But the t-test works with the sample means, so how are the means distributed? According to the [central limit theorem](https://en.wikipedia.org/wiki/Central_limit_theorem), sample means tend to be normally distributed around the population mean for sufficiently large sample sizes, even if the underlying distribution is not normal. 
Let\'s try to understand that by randomly sampling 5, 10, 50, and 100 answers from the German happiness data 10000 times and looking at the histograms.\n\n\n```python\n# prepare a figure\nfig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1,ncols=4,figsize=(16,5),sharex=True,sharey=True)\n\n# get the data (German respondents\' answers to the happiness question)\nhappy_de = ess.loc[(ess.cntry=="DE")&~(ess.happy.isna()),"happy"]\n# determine sample sizes\nsample_size = [5, 10, 50, 100]\n\nfor n, ax in zip(sample_size,[ax1, ax2, ax3, ax4]):\n \n # for each sample size, simulate 10000 random samples and calculate their means\n means = []\n for i in range(0,10000):\n sample = np.random.choice(happy_de, size=n, replace=True)\n mean = np.mean(sample)\n means.append(mean)\n \n # plot the distribution of sample means for each sample size\n sns.distplot(means,bins=20,kde=True,hist_kws={"density":True,"ec":"lightblue"}, kde_kws={"bw":0.2},ax=ax,)\n ax.set_xlabel("Sample mean")\n ax.set_title(f"Sample size = {n}")\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/insights-from-the-european-social-survey-8/output_39_0.png)\n\n\nWe can clearly see in the histograms above that for smaller sample sizes, the skew of the underlying data distribution is still visible, but for larger sample sizes, the sample means are practically normally distributed. We should keep in mind though that the mean is not always the best measure of the central tendency for all kinds of data. Even if the central limit theorem holds, if the underlying distribution of the data is strongly skewed, maybe the median is a much better representation of central tendency and then a non-parametric test would be the way to go.\n\n\n```python\nm = np.mean([ess[ess.cntry==c].shape[0] for c in ess.cntry.unique()])\nprint(f"Countries\' mean sample size: {m:.2f}")\n```\n\n Countries\' mean sample size: 1929.48\n \n\nWe are lucky to have a very large sample size. And due to the fixed answer scales in this survey, we can exclude the presence of strong outliers which could distort the calculated mean. Hence, we can probably safely assume that the normality assumption is met in our example.\nBut what should we do if this assumption were not met, e.g. if our sample size was too small or the sample distribution exhibited outliers, skewness, or heavy tails? This is often overlooked and I want to quote a great [blog post](https://garstats.wordpress.com/2016/05/02/robust-effect-sizes-for-2-independent-groups/) here, saying "When I was an undergrad, I was told that beyond a certain sample size (n=30 if I recall correctly), t-tests and ANOVAs are fine. **This was a lie.**"\n\nThere are more robust alternatives called non-parametric rank-based tests. They are called that because they do not make assumptions about the shape of the underlying distribution and instead compare the ranks of different groups. A non-parametric alternative to the two sample t-test is the [Mann-Whitney U test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), which tests if one group is stochastically greater than the other. It does so by ranking the data from both groups and counting how often an observation from one group is higher in rank than an observation from the other group. This count (which can be computed from the rank sums) is the U statistic, which is approximately normally distributed and, hence, can be assigned a z-score and a corresponding p-value. To assume this distribution of U under the null, the test also requires equal variance of the samples\' underlying populations. Have a look [here](http://www.statisticslectures.com/topics/mannwhitneyu/) for an example.
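\nAs a tiny inline illustration of what U counts (my addition, not from the original notebook), take two made-up groups of three answers each; U is then the number of pairs in which the first group beats the second, with ties counting one half:\n\n\n```python\nimport numpy as np\n\na = np.array([1, 3, 5])\nb = np.array([2, 3, 4])\n\n# count, over all n*m pairs, how often a value from a exceeds one from b (ties = 0.5)\nU = sum(0.5 if x == y else float(x > y) for x in a for y in b)\nprint(U, "out of", len(a) * len(b), "possible pairs")  # 4.5 out of 9\n```\n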
\n\nLet me show you that the Mann-Whitney U test will yield approximately the same result as a t-test on the ranks of normally distributed data.\n\n\n```python\nfrom scipy.stats import ttest_ind, mannwhitneyu, rankdata\n\n# make some random data\na = np.random.normal(0.0, 1.0, size=1000)\nb = np.random.normal(0.1, 1.0, size=1000)\n# perform a Mann-Whitney U test, the continuity correction should only be \n# applied for non-continuous data to handle ties\nU, p_mw = mannwhitneyu(a,b,use_continuity=False,alternative="two-sided")\n\n# rank data\nranked = rankdata(np.append(a,b),method="average")\nranked_a = ranked[:len(a)]\nranked_b = ranked[len(a):]\n# perform a t-test on ranks\nt, p_t = ttest_ind(ranked_a,ranked_b,equal_var=True)\n\nprint(f"Mann-Whitney U test p-value: {p_mw}\\nt-test p-value: {p_t}")\n```\n\n Mann-Whitney U test p-value: 0.17763236270223115\n t-test p-value: 0.1776965037746623\n \n\nI want to use both the t-test and the Mann-Whitney U test here to compare their results. The p-value will help us decide whether we want to consider a difference between the groups as statistically significant or not, but it does not tell us how large the difference is. Since the answers in the survey don\'t have a common unit, it makes sense to look at standardized effect sizes. \n\nA standardized difference in means can be expressed by [Cohen\'s $d$](https://en.wikipedia.org/wiki/Effect_size#Cohen\'s_d), the difference in group means divided by the pooled standard deviation. It assumes equality of variance as well, and can be positively biased for small samples, which shouldn\'t be a problem here. It has a less biased cousin called [Hedges\' $g$](https://en.wikipedia.org/wiki/Effect_size#Hedges\'_g) which is, however, not used as commonly. I will use $d$ here because there are well-accepted descriptors for its magnitude: 0.2 is considered a small, 0.5 a medium, and 0.8 a large effect. Note that these are just conventional benchmarks and that a different interpretation may make sense in some cases.\n\nAn effect size coming directly out of the Mann-Whitney U test is called the [common language effect size](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Common_language_effect_size), $f = \\frac{U}{nm}$, where n and m are the sizes of the two groups. It basically tells you, out of all pairs between both groups, the proportion of pairs that support one group being stochastically greater than the other. A value of 0.5 means the groups are stochastically equal and a value of 1.0 means one group is fully greater than the other. This is actually the same metric as the area under the receiver operating characteristic curve (AUC-ROC), which is commonly used to evaluate machine learning models. We can also define a rank biserial correlation as $r = 1 - \\frac{2U}{n m}$. If we\'d treat this correlation like a Pearson\'s correlation, we could transform it into a Cohen\'s $d$ as $d = \\frac{2r}{\\sqrt{1-r^{2}}}$. 
Please note that I\'m not really confident if that last move can actually be done with a rank biserial correlation and I just want to use it for comparison here - don\'t quote me on this.\n\n\n\n```python\n# I need to quickly modify Scipy.stats\' mannwhitneyu function so that it returns \n# the U statistic for both groups, not just the smaller one\n\nfrom scipy.stats import tiecorrect, distributions\ndef mannwhitneyu(x, y, use_continuity=True, alternative=None):\n """\n EDIT: My modified version that returns the U statistic for both groups, \n not just the smaller one\n \n Compute the Mann-Whitney rank test on samples x and y.\n Parameters\n ----------\n x, y : array_like\n Array of samples, should be one-dimensional.\n use_continuity : bool, optional\n Whether a continuity correction (1/2.) should be taken into\n account. Default is True.\n alternative : None (deprecated), \'less\', \'two-sided\', or \'greater\'\n Whether to get the p-value for the one-sided hypothesis (\'less\'\n or \'greater\') or for the two-sided hypothesis (\'two-sided\').\n Defaults to None, which results in a p-value half the size of\n the \'two-sided\' p-value and a different U statistic. The\n default behavior is not the same as using \'less\' or \'greater\':\n it only exists for backward compatibility and is deprecated.\n Returns\n -------\n statistic : float\n The Mann-Whitney U statistic, equal to min(U for x, U for y) if\n `alternative` is equal to None (deprecated; exists for backward\n compatibility), and U for y otherwise.\n \n U1: The Mann-Whitney U statistic for x\n \n U2: The Mann-Whitney U statistic for y\n \n pvalue : float\n p-value assuming an asymptotic normal distribution. One-sided or\n two-sided, depending on the choice of `alternative`.\n Notes\n -----\n Use only when the number of observation in each sample is > 20 and\n you have 2 independent samples of ranks. Mann-Whitney U is\n significant if the u-obtained is LESS THAN or equal to the critical\n value of U.\n This test corrects for ties and by default uses a continuity correction.\n References\n ----------\n .. [1] https://en.wikipedia.org/wiki/Mann-Whitney_U_test\n .. [2] H.B. Mann and D.R. Whitney, "On a Test of Whether one of Two Random\n Variables is Stochastically Larger than the Other," The Annals of\n Mathematical Statistics, vol. 18, no. 1, pp. 
50-60, 1947.\n """\n if alternative is None:\n warnings.warn("Calling `mannwhitneyu` without specifying "\n "`alternative` is deprecated.", DeprecationWarning)\n\n x = np.asarray(x)\n y = np.asarray(y)\n n1 = len(x)\n n2 = len(y)\n ranked = rankdata(np.concatenate((x, y)))\n rankx = ranked[0:n1] # get the x-ranks\n u1 = n1*n2 + (n1*(n1+1))/2.0 - np.sum(rankx, axis=0) # calc U for x\n u2 = n1*n2 - u1 # remainder is U for y\n T = tiecorrect(ranked)\n if T == 0:\n raise ValueError(\'All numbers are identical in mannwhitneyu\')\n sd = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)\n\n meanrank = n1*n2/2.0 + 0.5 * use_continuity\n if alternative is None or alternative == \'two-sided\':\n bigu = max(u1, u2)\n elif alternative == \'less\':\n bigu = u1\n elif alternative == \'greater\':\n bigu = u2\n else:\n raise ValueError("alternative should be None, \'less\', \'greater\' "\n "or \'two-sided\'")\n\n z = (bigu - meanrank) / sd\n if alternative is None:\n # This behavior, equal to half the size of the two-sided\n # p-value, is deprecated.\n p = distributions.norm.sf(abs(z))\n elif alternative == \'two-sided\':\n p = 2 * distributions.norm.sf(abs(z))\n else:\n p = distributions.norm.sf(z)\n\n u = u2\n # This behavior is deprecated.\n if alternative is None:\n u = min(u1, u2)\n \n return u, u1, u2, p\n\n```\n\n### Performing tests \n\nNow let\'s finally perform the tests. We start with Levene\'s test for equal variances on the previously mentioned examples. If the p-value falls below our chosen significance level of 0.05, we will assume that the variances of the samples are unequal.\n\n\n```python\nfrom scipy.stats import levene\n# perform Levene\'s test on the samples of interest\nfor var in ["happy","trstun","imwbcnt","ccrdprs"]:\n a = ess.loc[(ess.cntry=="DE")&~(ess[var].isna()),var]\n b = ess.loc[(ess.cntry=="SE")&~(ess[var].isna()),var]\n stat, p = levene(a,b)\n print(f"p-value for Levene\'s test on variable {var}: {p}")\n```\n\n p-value for Levene\'s test on variable happy: 0.04897068764262911\n p-value for Levene\'s test on variable trstun: 2.0311968228602714e-08\n p-value for Levene\'s test on variable imwbcnt: 0.006296412299982697\n p-value for Levene\'s test on variable ccrdprs: 0.10995168268972666\n \n\nFor all but the last test, the p-value is lower than 0.05, so we should generally consider inequality of variances. For the t-test that means that we\'ll use Welch\'s version instead of the Student\'s version. For the Mann-Whitney U test that unfortunately means that an assumption is violated in a way we cannot fix.
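\n\nFor reference, here is a compact sketch (my addition, not part of the original notebook) of the two standardized effect sizes discussed above, written out directly from two samples rather than derived from the test statistics:\n\n\n```python\nimport numpy as np\n\ndef cohens_d(a, b):\n    """Difference in group means divided by the pooled standard deviation."""\n    n, m = len(a), len(b)\n    pooled_sd = np.sqrt(((n - 1) * np.var(a, ddof=1) + (m - 1) * np.var(b, ddof=1)) / (n + m - 2))\n    return (np.mean(a) - np.mean(b)) / pooled_sd\n\ndef common_language_effect_size(a, b):\n    """Proportion of all pairs in which a value from a exceeds one from b (ties count 0.5)."""\n    wins = sum(0.5 if x == y else float(x > y) for x in a for y in b)\n    return wins / (len(a) * len(b))\n```\n\nThe `stats_tests()` function below instead derives $d$ from the t statistic and the common language effect size from the U statistic returned by the modified `mannwhitneyu()` above, which should give practically the same numbers.\n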
\n\n\n```python\n# implement the function for the tests\ndef stats_tests(country_a="Germany", country_b="Sweden", \n var_list=["happy","trstun","imwbcnt","ccrdprs"],\n alpha=0.05):\n """\n Performs Welch\'s t-test and Mann-Whitney U test on two countries\' \n answers to the questions corresponding to the supplied variables.\n \n Parameters\n ----------\n country_a, country_b : string,\n Names of the countries to compare\n var_list : list of strings\n List with variable names to compare\n alpha : float in range [0,1]\n Significance level alpha\n \n Returns\n -------\n None, just prints the results\n """\n \n cntry_dict = dict(zip(\n ["Austria","Belgium","Switzerland","Czech republic","Germany",\n "Estonia","Spain","Finland","France","United Kingdom","Hungary",\n "Ireland","Israel","Iceland","Italy","Lithuania","Netherlands",\n "Norway","Poland","Portugal","Russia","Sweden","Slovenia"],\n [\'AT\', \'BE\', \'CH\', \'CZ\', \'DE\', \'EE\', \'ES\', \'FI\', \'FR\', \'GB\', \n \'HU\',\'IE\', \'IL\', \'IS\', \'IT\', \'LT\', \'NL\', \'NO\', \'PL\', \'PT\', \n \'RU\', \'SE\',\'SI\']))\n \n # adjust significance level alpha with the Bonferroni correction\n alpha_cor = alpha / len(var_list)\n \n # for each comparison\n for var in var_list:\n # get data (without NaNs)\n a = ess.loc[(ess.cntry==cntry_dict[country_a])&~(ess[var].isna()),var]\n b = ess.loc[(ess.cntry==cntry_dict[country_b])&~(ess[var].isna()),var]\n\n # sample sizes of the two groups\n n = len(a)\n m = len(b)\n \n # perform Welch\'s version of the t-test to account for possibly unequal variances\n t, p_t = ttest_ind(a, b, equal_var=False)\n # calculate Cohen\'s d from t statistic\n d_t = t * np.sqrt(((n+m)/(n*m))*((n+m)/(n+m-2)))\n \n # We could also calculate d directly but this way there would be no correction \n # for unequal variance which I hope is accounted for through Welch\'s t-test. \n # The differences are negligible anyway, usually in the order of +/- 0.005.\n # s = np.sqrt(((n-1)*np.var(a)+(m-1)*np.var(b))/(n+m))\n # d = (np.mean(a)-np.mean(b))/s\n \n # perform Mann-Whitney U test (modified version from above, without continuity correction)\n U, U1, U2, p_mw = mannwhitneyu(a, b, use_continuity=False,alternative="two-sided")\n # calculate common language effect size\n f = U1/(n*m)\n # calculate rank biserial correlation\n r = 1- (2*U1/(n*m))\n # convert r to d (don\'t do this step for your own research without consulting a statistician)\n d_mw = 2*r/np.sqrt(1-r**2)\n \n # print the test results\n print("#----------------------------------------------------------------#")\n question = variables.query("Name == @var").Label.values[0]\n print(f"Question: {question}")\n print(f"Tests for {country_a} vs. 
{country_b} at significance level alpha={alpha}\\n\\\nBonferroni-corrected alpha: {alpha_cor}\\n")\n print(f"Results of two-independent-sample Welch\'s t-test:\\nd: {d_t}\\np-value: {p_t}")\n if p_t < alpha_cor:\n print("The effect IS statistically significant.\\n")\n else:\n print("The effect IS NOT statistically significant.\\n")\n print(f"Mann-Whitney U test:\\nf: {f}\\nd: {d_mw}\\np-value: {p_mw}")\n if p_mw < alpha_cor:\n print("The effect IS statistically significant.\\n")\n else:\n print("The effect IS NOT statistically significant.\\n")\n print("#----------------------------------------------------------------#")\n\n```\n\n\n```python\n# let\'s perform tests on the answers of German and Swedish respondents\nstats_tests(country_a="Germany",country_b="Sweden", var_list=["happy","trstun","imwbcnt","ccrdprs"],alpha=0.05)\n```\n\n #----------------------------------------------------------------#\n Question: How happy are you\n Tests for Germany vs. Sweden at significance level alpha=0.05\n Bonferroni-corrected alpha: 0.0125\n \n Results of two-independent-sample Welch\'s t-test:\n d: -0.055016336450354204\n p-value: 0.0814180779725071\n The effect IS NOT statistically significant.\n \n Mann-Whitney U test:\n f: 0.5054074107587205\n d: -0.021630908051316983\n p-value: 0.33415032425225033\n The effect IS NOT statistically significant.\n \n #----------------------------------------------------------------#\n #----------------------------------------------------------------#\n Question: Trust in the United Nations\n Tests for Germany vs. Sweden at significance level alpha=0.05\n Bonferroni-corrected alpha: 0.0125\n \n Results of two-independent-sample Welch\'s t-test:\n d: -0.6035233736514466\n p-value: 1.508529807853895e-77\n The effect IS statistically significant.\n \n Mann-Whitney U test:\n f: 0.6056753608107497\n d: -0.4324708345379035\n p-value: 1.4695150972378358e-71\n The effect IS statistically significant.\n \n #----------------------------------------------------------------#\n #----------------------------------------------------------------#\n Question: Immigrants make country worse or better place to live\n Tests for Germany vs. Sweden at significance level alpha=0.05\n Bonferroni-corrected alpha: 0.0125\n \n Results of two-independent-sample Welch\'s t-test:\n d: -0.47013531037374584\n p-value: 1.4773494884468852e-48\n The effect IS statistically significant.\n \n Mann-Whitney U test:\n f: 0.6136761978012034\n d: -0.46693256299062985\n p-value: 1.8808893248968186e-48\n The effect IS statistically significant.\n \n #----------------------------------------------------------------#\n #----------------------------------------------------------------#\n Question: To what extent feel personal responsibility to reduce climate change\n Tests for Germany vs. Sweden at significance level alpha=0.05\n Bonferroni-corrected alpha: 0.0125\n \n Results of two-independent-sample Welch\'s t-test:\n d: 0.038632456687586435\n p-value: 0.22105438200285313\n The effect IS NOT statistically significant.\n \n Mann-Whitney U test:\n f: 0.4664725376645704\n d: 0.13441237319468877\n p-value: 0.12167398110111964\n The effect IS NOT statistically significant.\n \n #----------------------------------------------------------------#\n \n\nThe two types of tests agree on the matter of significance but for the non-significant results, their calculated p-values can be quite different from each other. 
This is not too surprising considering that we already know that the Mann-Whitney U test\'s assumption of equal variances is violated and that the two tests are not testing the same null hypothesis (no difference in **means** for the t-test vs. **medians** for the Mann-Whitney U test). Furthermore, parametric tests can have a higher statistical power when the data meets the tests\' assumptions because the test then makes use of the knowledge of the underlying distribution. In the case of Welch\'s t-test, all assumptions seem to be met.\n\nThe estimated effect sizes from both types of tests are not too different, differing not more than 0.1 here. The Germans and Swedes who responded in this survey show no statistically significant difference in their happiness or their personal responsibility to reduce climate change. The Swedes, however, think more than the Germans that immigrants make the country a better place to live ($d$ = -0.47, a small to medium effect) and they have more trust in the United Nations ($d$ = -0.61, a medium effect).\n\nLet\'s try another combination: What do Italians and French think about the question "How important is it to be successful and that people recognize one\'s achievements?"\n\n\n\n```python\nstats_tests(country_a="Italy",country_b="France", var_list=["ipsuces"],alpha=0.05)\n```\n\n #----------------------------------------------------------------#\n Question: Important to be successful and that people recognize achievements\n Tests for Italy vs. France at significance level alpha=0.05\n Bonferroni-corrected alpha: 0.05\n \n Results of two-independent-sample Welch\'s t-test:\n d: 1.4386826625732236\n p-value: 0.0\n The effect IS statistically significant.\n \n Mann-Whitney U test:\n f: 0.21174931004840972\n d: 1.4110979775482093\n p-value: 0.0\n The effect IS statistically significant.\n \n #----------------------------------------------------------------#\n \n\nOh wow! A p-value of zero and $d$ of 1.44, that\'s a huge effect! Looks like the Italian respondents here care a lot more about being successful and recognized by others.\n\nNow please feel free to check out any question and country combination yourself! In the next section we will try to build machine learning models to predict how people did respond to certain questions and try to explain these predictions.\n\n### Predicting answers with machine learning \nWe have seen in the previous sections that people who answer some questions in a certain way also seem likely to answer other questions in a specific way. There seem to be certain types of personalities, attitudes, or opinions which can be observed among the respondents of the survey. Can we make use of these patterns to make a prediction of how people feel about something, based on the answers they have given? \n\nIn this example, I want to see if I can predict how happy respondents rate themselves. This is a regression problem for which machine learning algorithms come in handy. Usually we\'d also like to know what factors are relevant for a prediction. In linear models we can access such information relatively easily, but often linear models are not complex enough to model complex relationships in the data. On the contrary, modern ensemble algorithms are much more powerful predictors but their internal complexity often makes it hard to explain their predictions and they\'re often described as black-box models. 
Interpretability can be a very important factor though, and EU regulations even demand a [right to explanation](https://ora.ox.ac.uk/catalog/uuid:593169ee-0457-4051-9337-e007064cf67c/download_file?file_format=pdf&safe_filename=euregs.pdf&type_of_work=Journal+article) for algorithmic decision making. To quote another [blogpost](https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d): "If you ask me to swallow a black pill without telling me what\u2019s in it, I certainly don\u2019t want to swallow it. The interpretability of a model is like the label on a drug bottle."\n\nI will compare both approaches here and try to provide explanations for predictions by using [SHAP values](http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf).\n\n### Data preparation \nNow we have to think about how to prepare the data for machine learning. Let\'s think about outliers first. In the data from the x to y scale kind of answers, we won\'t have any outliers. But the four continuous variables, news consumption, internet usage, age, and years of education, could contain outliers. We will detect them visually below.\n\n\n```python\n# detect outliers for the continuous variables\n_ = ess[variables.query("Scale_type == \\"continuous\\"").Name].plot(kind="box")\n```\n\n\n\n\n\nThe outliers for the variables `nwspol` (news consumption), `netustm` (internet usage), and `eduyrs` (years of education) probably make sense and we should not delete them. Some people just like to browse or study a lot and that is important information. But certainly no one is 1000 years old, so we\'ll remove everything above say 100 years of age. We should also remove the entire varible `stflife` corresponding to the question "How satisfied with life as a whole?" because it is almost identical with "How happy are you?" and, therefore, represents a source of [data leakage](https://www.quora.com/Whats-data-leakage-in-data-science). Last but not least, all rows with missing label (happiness value) have to be dropped.\n\n\n```python\n# remove age outlier\ness.agea.where(ess.agea<100,np.NaN,inplace=True)\n\n# remove variable stflife\ness.drop("stflife",axis=1,inplace=True)\n\n# remove rows with missing happiness value\ness = ess[~ess.happy.isna()]\n```\n\nLet\'s now split off a hold-out set from our data. Since not all countries and genders have the same number of respondents (see plot below) we should stratify the split so that both the training and test data contain a similar fraction from each country and gender.\n\n\n```python\n# plot respondents per country\nfig, (ax1, ax2) = plt.subplots(nrows=1,ncols=2,figsize=(10,3))\nsns.barplot(ess.cntry.value_counts().index, ess.cntry.value_counts(),color="navy",alpha=0.4,ax=ax1)\nsns.barplot(ess.gndr.value_counts().index,ess.gndr.value_counts(),color="navy",alpha=0.4,ax=ax2)\nax1.set_xticklabels(ax1.get_xticklabels(),rotation="vertical")\nax1.set_xlabel("country")\nax2.set_xlabel("gender")\nfor ax in [ax1,ax2]:\n ax.set_ylabel("Number of respondents")\nplt.tight_layout()\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/insights-from-the-european-social-survey-8/output_60_0.png)\n\n\n\n```python\n# do a stratified split of training and test data\nfrom sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(ess.drop("happy",axis=1), \n ess.happy, \n stratify=ess[["cntry","gndr"]], \n test_size=0.25, random_state=42)\n```\n\nThere are still about 150 variables left. 
Many of these are probably not strongly correlated with a person\'s happiness. It may make sense to remove uninformative features because it will reduce the computational cost and reduce the dimensionality of the data. The more dimensions are considered, the less densely populated the feature space will be. That means that new data is likely to be further away from learned training examples, which can make prediction harder. This problem is also known as the [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality#Machine_learning).\nWe can calculate Pearson\'s correlation and have a look at how many features are at least weakly correlated with happiness:\n\n\n```python\n# calculate correlation\ncor = ess.corr().happy\nhighcor = cor[np.abs(cor)>=0.15].sort_values()\nprint(f"{len(highcor)} out of {len(cor)} features have a correlation with happiness >= 0.15")\n```\n\n 49 out of 154 features have a correlation with happiness >= 0.15\n \n\nRemoving features may not necessarily yield a large increase in prediction quality, but it will simplify the problem and, considering we are also interested in explaining which features are particularly important in predicting happiness, I think it makes sense here. If only dimensionality reduction were of interest, one could also apply unsupervised machine learning methods such as [principal component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis), but we will not consider this here.\n\nThere are different approaches to feature selection. Some models offer ways (e.g. feature importances from decision trees or coefficients from [Lasso](https://en.wikipedia.org/wiki/Lasso_(statistics))-regularized linear regression) to select important features, but this requires training another model before the actual model. A simpler way is given by univariate statistical tests on the individual features, which do not take any feature interactions into account. This is what we\'ll do here by, say, selecting the 50 highest scoring features.\n\nWe will already perform one-hot encoding on the categorical variables (country and gender) at this point, using Pandas\' `get_dummies()` function rather than Scikit-Learn\'s `OneHotEncoder`.\n\n\n```python\nX_train = pd.get_dummies(X_train,columns=["cntry","gndr"])\nX_test = pd.get_dummies(X_test,columns=["cntry","gndr"])\n```\n\n\n\nNext, we should also consider standardizing (scale to zero mean and unit variance) the data because linear models tend to work better with data on similar scales and this way we will also be able to directly compare the coefficients from the linear model and use them as a measure of feature importance. For tree-based methods (as in the gradient boosted ensemble we will use later) this should not matter, but it should not hurt either. The default `StandardScaler` in Scikit-Learn is not very robust to outliers though; therefore, we will use a `PowerTransformer` instead, which transforms the data into a Gaussian-like shape and also applies zero-mean-unit-variance normalization. \n\nRegarding all the missing values (NaNs), we have two options: We could either drop the rows containing missing values or impute them. Dropping would mean that we would lose not only one answer but an entire respondent who may have given valid answers to many other questions, so I would first try to impute the values. 
Since we\'ve already seen that some answers are distributed in a rather skewed way, I will impute missing values with the column median rather than the mean.\n\n\n```python\nfrom sklearn.preprocessing import PowerTransformer\nfrom sklearn.impute import SimpleImputer\nfrom sklearn.feature_selection import SelectKBest\nfrom sklearn.feature_selection import f_regression\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\n\n# get column/variable names for the different preprocessing operations\nnum = X_train.loc[:,"nwspol":"impfun"].columns\ncat = X_train.loc[:,"cntry_AT":"gndr_2"].columns\n\n# set up the preprocessing pipeline with imputing and power-transforming for numerical values\npreprocessing = ColumnTransformer(transformers=[\n ("numpipe",Pipeline(steps=[\n ("impute",SimpleImputer(strategy="median")),\n ("transform",PowerTransformer(standardize=True)) \n ]),num),\n ("onehot","passthrough",cat)\n])\n\n\n# since we do not have any hyper-parameters to tune within the preprocessing,\n# we will preprocess the data here and not within a single complete pipeline\n# so that we save time in the gridsearch\nX_train_prep = pd.DataFrame(preprocessing.fit_transform(X_train), columns=X_train.columns, index=X_train.index)\nX_test_prep = pd.DataFrame(preprocessing.transform(X_test), columns=X_test.columns,index=X_test.index)\n\n```\n\nWe need a way to recover the names of the features that have been selected:\n\n\n```python\nselect = SelectKBest(score_func=f_regression,k=50).fit(X_train_prep,y_train)\n# get the indices of the 50 highest scores\nidx = select.scores_.argsort()[-50:] \n# get the corresponding feature names\nselected_features = X_train_prep.columns[idx]\nprint(selected_features)\n```\n\n Index([\'impfun\', \'cflsenr\', \'iorgact\', \'hmsacld\', \'ipcrtiv\', \'cptppola\',\n \'cntry_RU\', \'trstun\', \'psppsgva\', \'impcntr\', \'iphlppl\', \'imsmetn\',\n \'imdfetn\', \'slvuemp\', \'hhmmb\', \'lkuemp\', \'iplylfr\', \'psppipla\',\n \'trstprt\', \'hmsfmlsh\', \'trstplt\', \'atchctr\', \'wkdcorga\', \'atcherp\',\n \'freehms\', \'aesfdrk\', \'netusoft\', \'sclact\', \'imbgeco\', \'hinctnta\',\n \'stfgov\', \'pplhlp\', \'trstprl\', \'hlthhmp\', \'imueclt\', \'trstlgl\',\n \'inprdsc\', \'ppltrst\', \'sclmeet\', \'imwbcnt\', \'slvpens\', \'stfedu\',\n \'pplfair\', \'trstplc\', \'stfhlth\', \'stfdem\', \'lknemny\', \'stfeco\',\n \'hincfel\', \'health\'],\n dtype=\'object\')\n \n\n### Interpretation of linear models \nWith all the necessary preprocessing in place, we can train the first model. Let\'s start with an elastic-net-regularized linear regression.\n\n\n```python\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.model_selection import GridSearchCV\n\n# combine the feature selection with the regression model\nregressor = Pipeline(steps=[\n ("select",SelectKBest(score_func=f_regression,k=50)),\n ("reg",ElasticNet(random_state=42)),\n])\n\n# hyper-parameter options for optimization\nparam_grid = {"reg__alpha": np.logspace(-3,2,6),\n "reg__l1_ratio": [0.0,0.25,0.5,0.75]}\n\n# perform a grid search cross validation for hyper-parameter optimization\nlin_grid = GridSearchCV(regressor,param_grid=param_grid,\n scoring="neg_mean_absolute_error", \n n_jobs=-1, iid=False, \n cv=5,verbose=0).fit(X_train_prep,y_train)\n```\n\nHow did the linear model perform? 
\n\n\n```python\nfrom sklearn.metrics import mean_absolute_error\n\nypred = lin_grid.predict(X_test_prep)\ntest_err = mean_absolute_error(y_test, ypred)\nprint(f"Best parameters: {lin_grid.best_params_}\\n\\\nCross-validation mean absolute error: {-1*lin_grid.best_score_}\\n\\\nTest mean absolute error: {test_err}")\n```\n\n Best parameters: {\'reg__alpha\': 0.01, \'reg__l1_ratio\': 0.25}\n Cross-validation mean absolute error: 1.1550813768624484\n Test mean absolute error: 1.1353186284187047\n \n\nThe linear model has a mean absolute error around 1.1 to 1.2 when predicting happiness on a scale of 0 to 10. That\'s not a bad start; we\'ll see if we can do better later on. Which features seem to be contributing the most or least to people\'s happiness? In linear models, we can directly look at the coefficients: \n\n\n```python\n# get features that were selected \nselected_features = X_train_prep.columns[lin_grid\n .best_estimator_\n .named_steps[\'select\']\n .scores_.argsort()[-50:]].tolist()\n\n# bring them in the right order so that they\'ll match the coefficients\' order\nordered_features = [f for f in X_train_prep.columns if f in selected_features]\n\n# the selected dummy variable "cntry_RU" doesn\'t exist in the variables data frame yet, so we add it\nvariables = variables.append(\n pd.DataFrame(np.array(["cntry_RU","Living in Russia",None,None,None,None,None,None,None,None,]).reshape(-1,10),\n columns=variables.columns),\n ignore_index=True\n) \n\n# get the trained model\'s coefficients\ncoef = lin_grid.best_estimator_.named_steps[\'reg\'].coef_\n\n# put it all in a data frame\nfeatures = pd.DataFrame(sorted(zip(coef,\n ordered_features,\n variables.query("Name in @ordered_features").Label,),\n reverse=True),\n columns=["coefficient","feature","question"])\n\n# look at the three happiest and three saddest features\nfeatures.iloc[[0,1,2,-3,-2,-1]]\n```\n\n\n\n\n
|    | coefficient | feature | question |
|----|-------------|---------|----------|
| 0  | 0.247130  | health  | Subjective general health |
| 1  | 0.219648  | stfeco  | How satisfied with present state of economy in... |
| 2  | 0.213510  | atchctr | How emotionally attached to [country] |
| 47 | -0.076225 | trstplt | Trust in politicians |
| 48 | -0.098587 | hlthhmp | Hampered in daily activities by illness/disabi... |
| 49 | -0.135400 | lknemny | How likely not enough money for household nece... |
\n\n\n\nWe can see above that the two most important features that contribute to people feeling happy are their health and their satisfaction with the present state of the economy. On the other hand, being hampered by illness and worrying about not having enough money for household necessities are contributing to people being sad. That\'s not surprising overall, but I find it very interesting to see how much each factor contributes.\nThere actually is a handy Python library called `eli5`, which can give you this information more easily if you just supply it with the trained estimator and the feature names (it will also show the intercept, which it labels `<BIAS>`):\n\n\n```python\nimport eli5\neli5.show_weights(lin_grid.best_estimator_,feature_names=X_train_prep.columns.values)\n```\n\n\n\n\n
| Weight | Feature |
|--------|---------|
| +7.441 | `<BIAS>` |
| +0.247 | health |
| +0.220 | stfeco |
| +0.214 | atchctr |
| +0.186 | hhmmb |
| +0.184 | hincfel |
| +0.155 | trstplc |
| +0.126 | stfedu |
| +0.123 | sclmeet |
| +0.103 | sclact |
| +0.095 | iphlppl |
| +0.087 | pplfair |
| +0.083 | stfhlth |
| +0.080 | imwbcnt |
| +0.072 | inprdsc |
| +0.069 | ipcrtiv |
| +0.068 | wkdcorga |
| … 15 more positive … | |
| … 14 more negative … | |
| -0.076 | trstplt |
| -0.099 | hlthhmp |
| -0.135 | lknemny |
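Under the hood, these weights are simply the model's fitted coefficients, and a single prediction decomposes into the intercept plus one coefficient-times-feature-value term per feature. The following is a minimal, purely illustrative sketch of that decomposition using the fitted pipeline from above (the row index 30092 is just the example respondent that is also explained below):

```python
# illustrative only: recover the per-feature contributions of a linear model by hand
reg = lin_grid.best_estimator_.named_steps["reg"]      # fitted ElasticNet
sel = lin_grid.best_estimator_.named_steps["select"]   # fitted SelectKBest
x = sel.transform(X_train_prep.iloc[[30092]])[0]       # one respondent, selected features only

contributions = reg.coef_ * x                          # contribution of each (transformed) answer
print(reg.intercept_ + contributions.sum())            # equals the pipeline's prediction for this row
```

This is essentially the decomposition that `eli5.show_prediction` displays below.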
\n\n\nUsing the fitted coefficients, we can also see how each answer of an individual respondent contributes to their predicted happiness score. Let\'s have a look at a random person:\n\n\n```python\n# eli5 can\'t predict with pipelines, so we\'ll do the feature selection and model training again manually\nselect = lin_grid.best_estimator_.named_steps[\'select\']\nX_train_sel = pd.DataFrame(select.transform(X_train_prep),columns=ordered_features,index=X_train_prep.index)\nlin_reg = ElasticNet(alpha=0.01, l1_ratio=0.25,random_state=42).fit(X_train_sel,y_train)\nfeature_names = variables.query("Name in @ordered_features").Label.values\n\n# explain the score prediction for respondent 30092\neli5.show_prediction(lin_reg, X_train_sel.iloc[30092], top=(4,3), feature_names=feature_names)\n```\n\n
| Contribution | Feature |
|--------------|---------|
| +7.441 | `<BIAS>` |
| +0.501 | How satisfied with present state of economy in country |
| +0.267 | State of education in country nowadays |
| +0.257 | How emotionally attached to [country] |
| … 27 more positive … | |
| … 14 more negative … | |
| -0.091 | How many people with whom you can discuss intimate and personal matters |
| -0.175 | Hampered in daily activities by illness/disability/infirmity/ mental problem |
| -0.267 | Trust in the police |
\n\n\nHere we can see that this respondent is made happy by her/his satisfaction with the present state of the economy but that her/his (presumably low) trust in the police is reducing the predicted happiness. \n\nThese calculations are very informative and straightforward because they directly "fall out" of the linear model. The value of a feature\'s coefficient informs us about the feature\'s impact. In high-performing, more complicated models like tree-based ensembles or deep neural networks, there is no longer a direct one-to-one relationship between an input feature and the model\'s trained parameters. This makes it difficult to understand the overall impact of features and individual predictions. \n\nThere are, however, general approaches for explaining any kind of machine learning model by making the explanation a model itself. A very powerful concept is that of SHAP (SHapley Additive exPlanations) values. The idea is the following: How do we know how much each individual feature value contributes to the value of a prediction? We have to calculate each feature\'s marginal contribution for every possible permutation of features. The average of all these marginal contributions is the SHAP value. For more details (and the math), please refer to the [original paper](http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf). Let\'s try this concept on the linear model. Since the calculation of SHAP values is quite time-consuming, we will demonstrate it on a smaller subset of the training data.\n\n\n```python\nimport shap\n# shap uses javascript\nshap.initjs()\n\n\n# get 100 random index numbers as our random sample\nnp.random.seed(42)\nrnd = np.random.randint(0,X_train_sel.shape[0],100)\n\n# create an explainer and calculate SHAP values for our sample\nexp = shap.KernelExplainer(lin_reg.predict,X_train_sel.iloc[rnd],l1_reg="num_features(50)")\nshap_values = exp.shap_values(X_train_sel.iloc[rnd])\n```\n\n\n```python\n# save the explanation model and shap values because they take a long time to calculate\nimport pickle\npickle.dump(exp,open(b"SHAP.exp","wb"))\npickle.dump(shap_values,open(b"SHAP.val","wb"))\n```\n\n\n```python\n# load the explanation model and shap values\n#import pickle\n#exp = pickle.load(open("SHAP.exp","rb"))\n#shap_values = pickle.load(open("SHAP.val","rb"))\n```\n\n\n```python\n# plot the calculated SHAP values\nfig, ax = plt.subplots(nrows=1,ncols=1,figsize=(12,12))\nplt.yticks(rotation=15)\nshap.summary_plot(shap_values, X_train_sel.iloc[rnd],\n max_display=50, auto_size_plot=False, \n plot_type="violin")\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/insights-from-the-european-social-survey-8/output_84_0.png)\n\n\nFor all examples, SHAP values of all features were calculated. A high SHAP value means that, in the corresponding example, the value of the feature contributed positively to the calculated score, compared to a baseline score (in this case, the average happiness of all respondents in the sample); the color indicates whether the underlying feature value itself was high (red) or low (blue). We can now see that the features which have a high impact also have high absolute SHAP values. For example, the `health` feature has highly positive and highly negative SHAP values. This is because a high value of `health` (people feeling more healthy) contributes strongly to their happiness, whereas a low value of `health` (people feeling unhealthy) makes them feel less happy. 
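To make the idea of averaging marginal contributions over all feature orderings a bit more tangible, here is a tiny, self-contained brute-force sketch on a made-up three-feature model (purely illustrative, nothing here comes from the survey data; real SHAP implementations approximate this much more cleverly):

```python
from itertools import permutations
from math import factorial
import numpy as np

def toy_model(x):
    # an arbitrary toy model with an interaction term
    return 2 * x[0] + x[1] * x[2]

baseline = np.array([0.5, 0.5, 0.5])    # reference point ("average" feature values)
x = np.array([1.0, 0.0, 2.0])           # the instance we want to explain
n = len(x)

shap_vals = np.zeros(n)
for perm in permutations(range(n)):
    current = baseline.copy()
    prev_out = toy_model(current)
    for i in perm:                      # add the features one by one in this order
        current[i] = x[i]
        out = toy_model(current)
        shap_vals[i] += out - prev_out  # marginal contribution of feature i
        prev_out = out
shap_vals /= factorial(n)               # average over all orderings

# the SHAP values always add up to prediction minus baseline prediction
print(shap_vals, shap_vals.sum(), toy_model(x) - toy_model(baseline))
```

Since the number of orderings grows factorially with the number of features, the `shap` library has to approximate this, for example by sampling coalitions as in the `KernelExplainer` used above.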
As before, we may want to have a look at individual predictions to see how respondents\' individual answers contribute to their predicted happiness score:\n\n\n```python\n# display the original and transformed values of the answers to features/questions \n# with high SHAP values for one of the randomly sampled respondents\ni = 5\nidx = X_train_sel.iloc[rnd].iloc[i].name\nhigh_SHAP_feats = [v for s,v in sorted(zip(np.abs(shap_values[i,:]),\n X_train_sel.columns.tolist()),\n reverse=True)][:15]\nX_train.loc[[idx]].append(X_train_sel.loc[[idx]])[high_SHAP_feats].head()\n```\n\n\n\n\n
|  | health | stfeco | stfedu | trstplc | iphlppl | sclmeet | lknemny | hlthhmp | stfhlth | sclact | pplfair | inprdsc | trstplt | wkdcorga | atchctr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43595 | 1.000000 | 0.000000 | 1.000000 | 3.00000 | 2.000000 | 2.000000 | 1.000000 | 3.000000 | 0.000000 | 4.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 |
| 43595 | -2.420841 | -2.083101 | -1.962981 | -1.43188 | -2.153134 | -1.686221 | -1.279594 | 1.777702 | -2.070107 | 1.429339 | -1.671648 | -2.061476 | -1.646384 | -1.766708 | 0.526941 |
\n\n\n\n\n```python\n# explain a single prediction for one of the randomly sampled respondents\nshap.force_plot(exp.expected_value, shap_values[i,:], X_train_sel.iloc[rnd].iloc[i])\n```\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/insights-from-the-european-social-survey-8/forceplot1.PNG)\n\nAs we can see above, this respondent does not feel particularly happy. Her/his low subjective `health`, low satisfaction with the present state of the economy (`stfeco`) and education (`stfedu`) make her/him feel less happy, whereas, little worries about lack of money for household neccassities (`lknemny`) and frequent participation in social activities (`sclact`) make her/him more happy. Now that we have a universal way to explain the impact of our features for the overall model as well as for individual predictions, we can try to get better predictions with more complex models. \n\n\n### Interpretation of gradient-boosted ensembles \nIn the following, we will use an ensemble of gradient-boosted decision trees as implemented in the [XGBoost algorithm](https://en.wikipedia.org/wiki/XGBoost). The idea of [gradient-boosting](https://en.wikipedia.org/wiki/Gradient_boosting) is to sequentially train predictors in a way that tries to fit every new predictor to the residual errors of the previous predictor.\n\n\n```python\nfrom xgboost import XGBRegressor\n\n# combine the feature selection with the XGB regression model\nregressor_xgb = Pipeline(steps=[\n ("select",SelectKBest(score_func=f_regression,k=50)),\n ("reg",XGBRegressor(booster="gbtree",n_jobs=4,\n random_state=42,verbosity=0)),\n])\n\n# hyper-parameter options for optimization\n# since the gridsearch takes a while, I commented out the non-ideal ones I checked\nparam_grid = {"reg__max_depth": [5],#[4,5,6],\n "reg__learning_rate":[0.1], #[0.05,0.1,0.15],\n "reg__n_estimators": [125],}#[10,50,75,100,125,150],}\n\n# perform a grid search cross validation for hyper-paramter optimization\nxgb_grid = GridSearchCV(regressor_xgb,param_grid=param_grid,\n scoring="neg_mean_absolute_error", iid=False, n_jobs=1,\n cv=5,verbose=0).fit(X_train_prep,y_train)\n```\n\n\n```python\nypred = xgb_grid.predict(X_test_prep)\ntest_err = mean_absolute_error(y_test, ypred)\nprint(f"Best parameters: {xgb_grid.best_params_}\\n\\\nCross-validation mean absolute error: {-1*xgb_grid.best_score_}\\n\\\nTest mean absolute error: {test_err}")\n```\n\n Best parameters: {\'reg__learning_rate\': 0.1, \'reg__max_depth\': 5, \'reg__n_estimators\': 125}\n Cross-validation mean absolute error: 1.1224201507398217\n Test mean absolute error: 1.098073806225703\n \n\nThe XGB model has a mean absolute error around 1.1 when predicting happiness on a scale of 0 to 10, a little bit better than the linear model. 
In tree-based models, there is also a way to assess feature importance:\n\n\n```python\n# get features that were selected \nselected_features_xgb = X_train_prep.columns[xgb_grid\n .best_estimator_\n .named_steps[\'select\']\n .scores_.argsort()[-50:]].tolist()\n\n# bring them in the right order and get the corresponding questions \nordered_features_xgb = [f for f in X_train_prep.columns if f in selected_features_xgb]\nfeature_names_xgb = variables.query("Name in @ordered_features_xgb").Label.values\n\n# because eli5 cannot handle pipelines, train the XGBoost model with the \n# optimized parameters from the grid search CV\nselect = xgb_grid.best_estimator_.named_steps[\'select\']\nX_train_sel_xgb = pd.DataFrame(select.transform(X_train_prep),\n columns=ordered_features_xgb,\n index=X_train_prep.index)\n\nxgb_reg = XGBRegressor(booster="gbtree", n_estimators=125,\n max_depth=5, learning_rate=0.1, n_jobs=4,\n random_state=42,verbosity=0).fit(X_train_sel_xgb,y_train)\n\n# show the weights associated with the features in the XGBoost model\neli5.show_weights(xgb_reg,feature_names=feature_names_xgb)\n```\n\n\n\n\n
| Weight | Feature |
|--------|---------|
| 0.1571 | Feeling about household's income nowadays |
| 0.0966 | Subjective general health |
| 0.0834 | How likely not enough money for household necessities next 12 months |
| 0.0697 | How satisfied with present state of economy in country |
| 0.0382 | How often socially meet with friends, relatives or colleagues |
| 0.0299 | Number of people living regularly as member of household |
| 0.0265 | How emotionally attached to [country] |
| 0.0234 | State of education in country nowadays |
| 0.0228 | Important to be loyal to friends and devote to people close |
| 0.0225 | Trust in the police |
| 0.0213 | Take part in social activities compared to others of same age |
| 0.0212 | Country's cultural life undermined or enriched by immigrants |
| 0.0196 | Important to help people and care for others well-being |
| 0.0185 | Hampered in daily activities by illness/disability/infirmity/ mental problem |
| 0.0182 | How many people with whom you can discuss intimate and personal matters |
| 0.0179 | State of health services in country nowadays |
| 0.0170 | Most people try to take advantage of you, or try to be fair |
| 0.0144 | Feeling of safety of walking alone in local area after dark |
| 0.0144 | Living in Russia |
| 0.0140 | Allowed to decide how daily work is organised |
| … 30 more … | |
\n\n\nThe weights shown here are different from the ones in a linear model. Instead of considering coefficients, which scale each feature\'s contribution to the predicted score, the feature importance in tree-based models is calculated by considering how many (decision tree) splits a feature contributes to. If many splits are made on a particular feature, it is considered more important for the prediction. That does, however, not tell us whether a feature contributes positively or negatively to the predicted value. \n\nThe `eli5` library has a way to estimate each feature\'s contribution for an XGBoost model, which is a bit more complex than for a linear model: The feature weights are calculated by back-tracking the decision paths in the trees of an ensemble. All feature weights sum up to the predicted score. Each leaf (end of a tree) has an output score from which an expected score can be assigned to its parent node (like a pseudo-leaf score). The change of the expected score from parent to child represents the contribution of one feature on the decision path. See this [blogpost](http://blog.datadive.net/interpreting-random-forests/) for more detail. Let\'s have a look at an individual prediction:\n\n\n```python\n# explain the score prediction for respondent 30092\neli5.show_prediction(xgb_reg, X_train_sel_xgb.iloc[30092], top=(5,4), feature_names=feature_names_xgb)\n```\n\n\n\n
| Contribution | Feature |
|--------------|---------|
| +6.938 | `<BIAS>` |
| +0.423 | How satisfied with present state of economy in country |
| +0.348 | How emotionally attached to [country] |
| +0.202 | Feeling about household's income nowadays |
| +0.139 | State of education in country nowadays |
| … 22 more positive … | |
| … 20 more negative … | |
| -0.074 | Most of the time people helpful or mostly looking out for themselves |
| -0.081 | Take part in social activities compared to others of same age |
| -0.101 | Trust in the police |
| -0.233 | Hampered in daily activities by illness/disability/infirmity/ mental problem |
\n\nWe can see that similar features as in the linear model contribute to this respondent\'s happiness score, but that the order of importance is a little different. In the XGBoost model, satisfaction with the state of education has a smaller positive contribution, while being hampered by illness has a larger negative contribution than in the linear model.\n\nLuckily, the SHAP values provide a model-agnostic method to evaluate feature contributions. We will calculate them for the XGBoost model on the same sample of respondents as for the linear model to facilitate comparison.\n\n\n```python\n# create an explainer and calculate the XGBoost model\'s SHAP values for the same sample\nexp_xgb = shap.TreeExplainer(xgb_reg,X_train_sel_xgb.iloc[rnd],feature_dependence="independent")\nshap_values_xgb = exp_xgb.shap_values(X_train_sel_xgb.iloc[rnd])\n```\n\n\n```python\n# save the XGB explanation model and shap values\nimport pickle\npickle.dump(exp_xgb,open(b"SHAP.xgb.exp","wb"))\npickle.dump(shap_values_xgb,open(b"SHAP.xgb.val","wb"))\n```\n\n\n```python\n# load the XGB explanation model and shap values\n#import pickle\n#exp = pickle.load(open("SHAP.xgb.exp","rb"))\n#shap_values = pickle.load(open("SHAP.xgb.val","rb"))\n```\n\n\n```python\n# plot the calculated SHAP values\nfig, ax = plt.subplots(nrows=1,ncols=1,figsize=(12,12))\nplt.yticks(rotation=15)\nshap.summary_plot(shap_values_xgb, X_train_sel_xgb.iloc[rnd],\n max_display=50, auto_size_plot=False, \n plot_type="violin")\n```\n\n\n![](https://pb-data-blogposts.s3.eu-central-1.amazonaws.com/insights-from-the-european-social-survey-8/output_100_0.png)\n\n\nCompared to the linear model, the same features are considered important, even though the order is not exactly the same.\nWhat about the individual prediction?\n\n\n```python\n# display the original and transformed values of the answers to features/questions \n# with high SHAP values for one of the randomly sampled respondents\ni = 5\nidx = X_train_sel_xgb.iloc[rnd].iloc[i].name\nhigh_SHAP_feats = [v for s,v in sorted(zip(np.abs(shap_values_xgb[i,:]),\n X_train_sel_xgb.columns.tolist()),\n reverse=True)][:15]\nX_train.loc[[idx]].append(X_train_sel_xgb.loc[[idx]])[high_SHAP_feats].head()\n```\n\n\n\n\n
|  | health | stfeco | sclmeet | sclact | inprdsc | lknemny | iphlppl | iplylfr | trstplc | hlthhmp | stfdem | hhmmb | trstun | trstplt | slvpens |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43595 | 1.000000 | 0.000000 | 2.000000 | 4.000000 | 0.000000 | 1.000000 | 2.000000 | 2.000000 | 3.00000 | 3.000000 | 0.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
| 43595 | -2.420841 | -2.083101 | -1.686221 | 1.429339 | -2.061476 | -1.279594 | -2.153134 | -2.191386 | -1.43188 | 1.777702 | -2.027733 | 0.483008 | -1.910358 | -1.646384 | -1.946657 |
\n\n\n\n\n```python\n# explain a single prediction of the XGBoost model for one of the randomly sampled respondents\nshap.force_plot(exp_xgb.expected_value, shap_values_xgb[i,:], X_train_sel_xgb.iloc[rnd].iloc[i])\n```\n![](https://raw.githubusercontent.com/Pascal-Bliem/european-social-survey/master/figures/forceplot.png)\n\nEven though the predicted happiness score is a little lower than for the linear model, the contribution of each feature is more or less identical with the linear model. This is actually really cool, because it shows that SHAP values provide us with a reliable and universally applicable method of explaining arbitrarily complex machine learning models. This does, to some extent, resolve the trade-off between complex but accurate vs. simple but interpretable models.\n\n### Conclusion \n\nIn this work, we have analyzed data from the 8th European Social Survey. We went through a detailed [data cleaning](#data) procedure in which suitable variables corresponding to survey questions were selected, invalid data points were removed, and data scales were adjusted, if necessary. \n\nWe explored how different questions correlate with each other and how the answers are distributed among the participating countries by employing an interactive [visualization](#viz). We saw certain patterns which may be described as attitudes, e.g. positive correlations between xeno- and homophobia, or political and climate responsibility. \n\nFurthermore, we performed [statistical tests](#stats) to evaluate the statistical significance and effect size of differences in answers of respondents from different countries. We saw, for example, that there is no significant difference in the reported happiness of German and Swedish respondents, whereas there is a very large difference in how important Italian and French respondents consider personal success and recognition.\n\nFinally, we employed [machine learning](#ml) methods to predict how happy respondents considers themselves, based on how they answered other questions. We compared the predictions of linear and gradient-boosted decision tree models in terms of their predictive power and interpretability. We then used SHAP values to explain how each feature contributes to an individual prediction for the two models. Good subjective health, high satisfaction with the present state of the economy, and an emotional attachment to the home country tend to contribute positively to the respondents\' happiness. On the contrary, worries about not having enough money for household necessities, or being hampered by illness or disability contributes negatively to the respondents\' happiness.\n\nThat\'s it! Thanks a lot for following me through this project and thanks as well to the organizers and participants of this survey and to the developers of the many great open-source tools I used in this project.\n')},function(e,t,n){"use strict";n.r(t);var a=n(2);t.default=new a.a("Social network analytics from scratch","Coding the algorithms and data structures for graph analysis",new Date("2019-09-18"),"https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/comsFB.png","Identifying communities and user importance.",["Data Science & AI/ML","Learning"],"A while ago, I felt that, besides advancing my data science knowledge, I also wanted to brush up on my general software engineering skills. 
Since I had taken a Java 101 course in my bachelor's, and Java is somewhat of the embodiment of object orientation, I picked the [*Object Oriented Java Programming: Data Structures and Beyond Specialization*](https://www.coursera.org/specializations/java-object-oriented) on [Coursera.org](https://www.coursera.org/), which consists of five intermediate courses on algorithms and data structures. Besides sorting, searching, lists, and trees, they also focused a whole lot on graphs. This work here is part of the open-ended capstone project of this specialization. I coded up the algorithms from scratch in Java and used NetworkX in Python along with a self-written test class to test my code. You can find all source code on my [Github](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance).\n\nI was interested in learning about structures within social networks. Among the users of a network, can we identify communities? Can we find out which users have a special role or are especially important within these communities? I will try to assess how important users are in a network of over 80000 Twitter users, based on their [in-degree centrality](https://en.wikipedia.org/wiki/Centrality#Degree_centrality) and [Page-Rank](https://en.wikipedia.org/wiki/PageRank) score. In the second part, we will try to identify communities within the Facebook data of 783 UC San Diego students, using the [Girvan-Newman algorithm](https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm).\n\nHere's an overview of what I want to cover:\n* [Analysis](#analysis)\n* [Class overview](#class-overview)\n* [Testing](#testing)\n* [Acknowledgement](#acknowledgement)\n\n## Analysis\nYou can also read this part with the actual code I used in this [Jupyter Notebook](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/blob/master/NetworkAnalysis.ipynb).\n### Important users on Twitter\nTwitter is an interesting network to study because it can be represented by a directed graph. Not all users who follow someone are being followed back. Some important or influential people may have a lot more followers, meaning a lot more incoming edges. \n\nThat's something that can be easily quantified, e.g. by a user's in-degree centrality (how many followers?). But maybe one wants to consider certain users as more important if they are followed by other important users. That's the idea of Google's famous Page-Rank algorithm, which considers how much of its own score each user contributes to the score of another user. Both these scores are computed by the Java code in this project.\n\nThe anonymous Twitter data we have here is a social circle data set from the SNAP database and can be found [here](https://snap.stanford.edu/data/ego-Twitter.html). The graph contains 81306 vertices and 1768149 edges.\n\nAfter calculating both metrics (stored in this [file](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/blob/master/data/Analysis/twitterIDCPR.txt)) and scaling them to the same range (between 0 and 100), we can observe that almost all users have a very low in-degree centrality and Page-Rank score and only very few users have very high scores. That agrees with the intuition that most users are just \"normal people\" who use the network to connect with their friends and only very few users can be considered as influential or important individuals. 
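Since NetworkX was used to cross-check the Java implementation, the same two metrics can be computed in Python roughly like this (a sketch only; the edge-list filename is an assumption about the downloaded SNAP data):

```python
import networkx as nx

# load the ego-Twitter edge list into a directed graph (filename assumed)
G = nx.read_edgelist("twitter_combined.txt", create_using=nx.DiGraph(), nodetype=int)

in_deg = nx.in_degree_centrality(G)        # fraction of other users following each user
pagerank = nx.pagerank(G, alpha=0.85)      # PageRank with the usual damping factor

# the ten most "important" users according to each metric
print(sorted(in_deg, key=in_deg.get, reverse=True)[:10])
print(sorted(pagerank, key=pagerank.get, reverse=True)[:10])
```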
We can also see that the differences between in-degree centrality and Page-Rank score are very small. Since most users have very low scores, contributions from \"important\" users to other \"important\" users, which make them even more important (the basic idea of the Page-Rank algorithm), seem to be practically irrelevant here. Both of these observations (low scores for most users and almost no difference between the two metrics) become clearer when they are plotted (on a log scale): \n\n![Users' in-degree centrality and Page-rank.](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/userIDCPR.png)\n\nWe have seen that, for this Twitter data, it is easy to identify the few important users in this network by either in-degree centrality or Page-Rank.\n\n### Communities on Facebook\nPresumably, users are part of many kinds of social communities on Facebook, e.g. you may be connected to many of your current and former coworkers and you may be the only connection between these groups. This follows the idea that there are many connections within communities and fewer among communities. Hence, the edges between communities are traversed by many shortest paths between users, which means the [betweenness centrality](https://link.springer.com/referenceworkentry/10.1007%2F978-1-4419-9863-7_874) of these edges is high. \n\nThe [Girvan-Newman algorithm](https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm) tries to separate a network into communities by cutting these edges with high betweenness centrality. It calculates edge betweenness centrality for every edge and then removes the one with the highest score. This procedure is repeated until the desired number of communities is identified, or all the way down until all edges are removed and only single users/vertices remain.\n\nThe anonymous Facebook data we have here is a \"snapshot\" of the network of students from the University of California in San Diego (UCSD) in 2005. The data was kindly provided by the hosts of the MOOC specialization *Object Oriented Java Programming: Data Structures and Beyond* on [Coursera.org](https://www.coursera.org/specializations/java-object-oriented). The graph contains 783 vertices and 1892 edges and is only a small fraction of the original data (due to limitations in computational power).\n\nLet's visualize the original network. We can see that most people seem to have many connections with many other people, but there are also some users on the outskirts of this network who are connected to only a few other users. \n![Visualization of user connections in the Facebook data.](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/origFB.png)\nNow let's look at the network after running the Girvan-Newman algorithm and identifying 30 communities (by removing 451 edges). We can also calculate the degree centrality for each user/vertex and use it for color-mapping.\n![Visualization of identified user communities and users' degree centrality in the Facebook data.](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/comsFB.png)\n\nWe can see that the Girvan-Newman algorithm identified two very large communities and a couple of smaller communities. 
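For reference, a comparable community split can be reproduced with NetworkX's built-in implementation of the algorithm; a rough sketch, where the edge-list filename is an assumption and the run gets slow quickly on larger graphs:

```python
import networkx as nx
from networkx.algorithms.community import girvan_newman

# load the UCSD Facebook snapshot as an undirected graph (filename assumed)
fb = nx.read_edgelist("facebook_ucsd.txt", nodetype=int)

# girvan_newman yields successively finer partitions; stop once we reach 30 communities
for communities in girvan_newman(fb):
    if len(communities) >= 30:
        break

# number of communities and the sizes of the five largest ones
print(len(communities), [len(c) for c in sorted(communities, key=len, reverse=True)[:5]])
```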
Within the larger communities, there are several users that have a high degree centrality, suggesting that they are more important or influential within these communities. Since the data we use here is anonymized, we can only speculate about the nature of these communities. Maybe they represent student unions, study and sport clubs, or people in different degree programs. \n\nIn a real example, we could now look into what it is that the users in these communities have in common. When working with real social network data, community detection algorithms and user-importance metrics can help us to look for patterns, connections, and trends in these networks with millions of users.\n\n### Performance\nYou can also read this part with the actual code I used in this [Jupyter Notebook](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/blob/master/PerformanceAnalysis.ipynb).\n\nThe test examples can be found in [this directory](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/tree/master/data/TestPerformance), along with the [Python code](https://github.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/blob/master/data/TestGraphs/TestGraphMaker.ipynb) that created them. \n\nTests were performed on graphs ranging from 100 to 3000 vertices in steps of 100 (the number of edges increases quadratically) and the execution time was recorded. Each case was tested 50 times to get better statistics. The Girvan-Newman algorithm for community detection, however, was only tested for one iteration on the 100 to 1000 vertex graphs and only 3 times due to its much higher complexity.\n\n#### In-degree centrality\nThis quantity is easy to calculate if edges are stored as adjacency lists. We just iterate through all vertices' adjacency lists and count +1 for every time a vertex has an incoming edge, then divide every value by V-1, where V is the number of vertices. Hence, the algorithm should run in O(E), where E is the number of edges. \n\n![](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/perfIDC.png)\n\n#### Page Rank\nThe algorithm calculates how much \"contribution\" of PageRank each vertex gets over any incoming edge it has from another vertex, so the complexity is O(E+V). \n\n![](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/perfPR.png)\n\n#### Tarjan algorithm for detecting strongly connected components\nThe algorithm is based on a recursive depth-first search, basically going along all paths in a component until it finds no more edges to go along, which means it found a full component. Its complexity is O(E+V). \n\n![](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/perfSCC.png)\n\n#### Girvan-Newman algorithm for detecting communities\nThe algorithm removes edges with the highest betweenness centrality to split apart communities. It first calculates the edge betweenness centrality for all edges, which means that it has to find all shortest paths from any vertex to any other vertex in the graph and see how many of these paths go through an edge. If there is more than one shortest path from one vertex to another, the paths have to be counted fractionally towards an edge's betweenness centrality. 
Finding shortest paths is done by a breadth-first search, complexity O(E+V), and it's done for every vertex combination, so finding all shortest paths should be of complexity O((E+V)$^{2}$). That is for one iteration (we are only looking at one iteration here), so it looks somewhat quadratic. For all possible iterations, meaning removing all edges, it would look cubic, and for a realistic case where we want to find a certain number of communities, it is probably somewhere in between quadratic and cubic big O. \n\n![](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/perfGN.png)\n\nLooking at all these algorithms in comparison, it becomes clear that the community detection is the bottleneck in this analysis: \n\n![](https://raw.githubusercontent.com/Pascal-Bliem/social-network-analysis---communities-and-user-importance/master/data/Figures/perfAll.png)\n\n## Acknowledgement\n\nI'd like to thank the instructors of the Coursera MOOC, Christine Alvarado, Mia Minnes, and Leo Porter, for putting together a really interesting specialization. I learned a lot and had quite some fun.\nAlso thanks to whoever read this till the very end :)\n")},,,,,,function(e,t,n){"use strict";n.r(t);var a=n(1),o=n.n(a),i=n(40),s=n.n(i),r=n(37),l=n(8),d=(n(52),n(28)),h=n.n(d),c=(n(53),n(54),n(41)),p=n.n(c),u=n(3),m=n.n(u),f=n(0),g=function(e){var t=e.fullyHideHeight,n=e.initialFullyHide;return Object(a.useEffect)((function(){var e=document.getElementById("navbar"),n=m.a.navbarHide,a=m.a.navbarHideFully,o=0,i=function(i){window.scrollY1024?(e.classList.add(a),o=window.scrollY):window.scrollY>o?(e.classList.remove(a),e.classList.add(n),o=window.scrollY):window.scrollY