{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# DL8.5 used to perform predictive clustering\nThis example illustrates how to use a user-specified error function to perform predictive\nclustering. The PyDL8.5 library also provides an implementation of predictive clustering\nthat does not require the use of a user-specified error function.\nCheck the DL85Cluster class for this implementation.\n\nThe main purpose of this example is to show how users of the library can implement their\nown decision tree learning task using PyDL8.5's interface for writing error functions.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\nfrom sklearn.model_selection import train_test_split\nimport time\nfrom dl85 import DL85Predictor\n\ntry:\n    from sklearn.metrics import DistanceMetric\nexcept ImportError:\n    # older scikit-learn releases only expose DistanceMetric under sklearn.neighbors\n    from sklearn.neighbors import DistanceMetric\n\ndataset = np.genfromtxt(\"../datasets/anneal.txt\", delimiter=' ')\nX = dataset[:, 1:]\nX_train, X_test = train_test_split(X, test_size=0.2, random_state=0)\n\n\nprint(\"############################################################################################\\n\"\n      \"#      DL8.5 clustering : user specific error function and leaves' values assignment       #\\n\"\n      \"############################################################################################\")\n\n# The quality of every cluster is determined using the Euclidean distance.\neucl_dist = DistanceMetric.get_metric('euclidean')\n\n\n# user error function\ndef error(tids):\n    \"\"\"Return the summed Euclidean distance between the selected rows and their centroid.\n\n    tids holds the row indices (into X_train) of the examples covered by a node.\n    \"\"\"\n    # collect the complete examples identified using the tids.\n    X_subset = X_train[list(tids), :]\n    # determine the centroid of the cluster\n    centroid = np.mean(X_subset, axis=0)\n    # calculate the distances towards centroid (one row per example, one column)\n    distances = eucl_dist.pairwise(X_subset, [centroid])\n    # return the sum of distances as the error; ndarray.sum() gives a scalar,\n    # whereas the builtin sum() would give a 1-element array\n    return float(distances.sum())\n\n\n# user leaf assignment\ndef leaf_value(tids):\n    \"\"\"Return the centroid of the training examples that fall into a leaf.\n\n    tids holds the row indices (into X_train) of the examples covered by the leaf.\n    \"\"\"\n    # The prediction for every leaf is the centroid of the cluster: select the\n    # rows from X_train (the data given to fit) and average per feature (axis=0)\n    # so that the result keeps one value per column instead of a single scalar.\n    return np.mean(X_train[list(tids), :], axis=0)\n\n\n# Change the parameters of the algorithm as desired.\nclf = DL85Predictor(max_depth=2, min_sup=5, error_function=error, leaf_value_function=leaf_value, time_limit=600)\n\nstart = time.perf_counter()\nprint(\"Model building...\")\nclf.fit(X_train)\nduration = time.perf_counter() - start\nprint(\"Model built. Duration of the search =\", round(duration, 4))\npredicted = clf.predict(X_test)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}