Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions regression-tests/daaltkregtests/lib/port.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[port]
__main__.arimaxtest.test_arima_scoring = 9100
__main__.arimaxtest.test_arx_scoring = 9101
__main__.arimaxtest.test_arimax_scoring = 9102
__main__.arimaxtest.test_max_scoring = 9103
__main__.kmeansclustering.test_model_scoring = 9104
__main__.gmm.test_model_scoring = 9105
__main__.ldamodeltest.test_model_scoring = 9106
__main__.linearregression.test_model_scoring = 9107
__main__.logisticregression.test_model_scoring = 9108
__main__.naivebayes.test_model_scoring = 9109
__main__.principalcomponent.test_model_scoring = 9110
__main__.randomforest.test_class_scoring = 9111
__main__.randomforest.test_reg_scoring = 9112
__main__.svmscoretest.test_model_scoring = 9113
# The following entries reuse the same port numbers as above, but under
# different keys: when a test belongs to a module, its testcase id changes.
arimax_test.arimaxtest.test_arima_scoring = 9100
arimax_test.arimaxtest.test_arx_scoring = 9101
arimax_test.arimaxtest.test_arimax_scoring = 9102
arimax_test.arimaxtest.test_max_scoring = 9103
kmeans_test.kmeansclustering.test_model_scoring = 9104
gmm_test.gmm.test_model_scoring = 9105
lda.ldamodeltest.test_model_scoring = 9106
linear_regression_test.linearregression.test_model_scoring = 9107
logistic_regression_test.logisticregression.test_model_scoring = 9108
naive_bayes_test.naivebayes.test_model_scoring = 9109
pca_test.principalcomponent.test_model_scoring = 9110
random_forest_test.randomforest.test_class_scoring = 9111
random_forest_test.randomforest.test_reg_scoring = 9112
svm_model_test.svmscoretest.test_model_scoring = 9113

84 changes: 84 additions & 0 deletions regression-tests/daaltkregtests/lib/score_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# vim: set encoding=utf-8

# Copyright (c) 2016 Intel Corporation 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


""" Library to support scoring in TAP for the ATK service """
import subprocess as sp
import requests
import time
import signal
import os
import config
from ConfigParser import SafeConfigParser

class scorer(object):
    """Context manager that launches a local scoring-engine server for a
    published model archive (.mar) and submits JSON records to it.

    Usage:
        with scorer(model_path, test_id) as s:
            response = s.score([{...feature: value...}])
    """

    def __init__(self, model_path, port_id, host=config.scoring_engine_host):
        """Set up the server location, port and model file.

        :param model_path: HDFS path of the exported model archive (.mar)
        :param port_id: key into port.ini's [port] section (normally the
            unittest test id) selecting this test's dedicated port
        :param host: host the scoring engine will be reached on
        """
        self.hdfs_path = model_path
        self.name = host.split('.')[0]
        self.host = host
        # Look up this test's dedicated port so concurrent tests do not
        # collide.  Use a local name distinct from the module-level
        # `config` import so the import is not shadowed.
        parser = SafeConfigParser()
        filepath = os.path.abspath(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "port.ini"))
        parser.read(filepath)
        self.port = parser.get('port', port_id)
        self.scoring_process = None

    def __enter__(self):
        """Activate the server and wait for it to come up."""
        # change current working directory to point at scoring_engine dir
        run_path = os.path.abspath(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "..", "..", "..", "scoring", "scoring_engine"))

        # keep track of cwd so it can be restored afterwards
        test_dir = os.getcwd()
        os.chdir(run_path)
        try:
            # os.setsid places the server in its own process group so the
            # whole process tree can be killed in __exit__
            self.scoring_process = sp.Popen(
                ["./bin/model-scoring.sh",
                 "-Dtrustedanalytics.scoring-engine.archive-mar=%s" % self.hdfs_path,
                 "-Dtrustedanalytics.scoring.port=%s" % self.port],
                preexec_fn=os.setsid)
        finally:
            # restore cwd even if launching the server fails
            os.chdir(test_dir)

        # wait for server to start
        # NOTE(review): a fixed sleep is fragile; polling the port until it
        # accepts connections would be more reliable
        time.sleep(20)
        return self

    def __exit__(self, *args):
        """Teardown the server."""
        # Kill the whole process group so all subprocesses die too
        pgrp = os.getpgid(self.scoring_process.pid)
        os.killpg(pgrp, signal.SIGKILL)

    def score(self, data_val):
        """POST the list of records data_val to the scoring endpoint.

        :param data_val: list of dicts mapping feature names to values
        :returns: the requests.Response object from the server
        """
        # Magic headers to make the server respond appropriately
        # Ask the head of scoring why these
        headers = {'Content-type': 'application/json',
                   'Accept': 'application/json,text/plain'}

        scoring_host = self.host + ":" + self.port
        submit_string = 'http://' + scoring_host + '/v2/score'
        response = requests.post(
            submit_string, json={"records": data_val}, headers=headers)
        return response
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# vim: set encoding=utf-8

# Copyright (c) 2016 Intel Corporation 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


""" test cases for the kmeans clustering algorithm """
import unittest
import time
import os
from daaltkregtests.lib import daaltk_test
from daaltkregtests.lib import score_utils
from daaltkregtests.lib import config


class KMeansClustering(daaltk_test.DaalTKTestCase):
    """Exercise kmeans cluster scoring through the scoring engine."""

    # feature columns shared by the train/test CSVs and the model
    _VECTOR_COLUMNS = ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"]

    def setUp(self):
        """Import the files to test against."""
        super(KMeansClustering, self).setUp()
        schema = [(col, float) for col in self._VECTOR_COLUMNS]
        schema.append(("term", str))

        self.frame_train = self.context.frame.import_csv(
            self.get_file("kmeans_train.csv"), schema=schema)
        self.frame_test = self.context.frame.import_csv(
            self.get_file("kmeans_test.csv"), schema=schema)

    @unittest.skip("daaltk: kmeans scoring engine produces different result than predict")
    def test_model_scoring(self):
        """Tests standard usage of the kmeans cluster algorithm."""
        kmodel = self.context.daaltk.models.clustering.kmeans.train(
            self.frame_train, ["Vec1", "Vec2", "Vec3", "Vec4", "Vec5"], k=5)

        predicted = kmodel.predict(self.frame_test)
        sample_rows = predicted.to_pandas(50)
        mar_path = kmodel.export_to_mar(
            self.get_export_file(self.get_name("daaltk_kmeans")))

        with score_utils.scorer(mar_path, self.id()) as scoring_server:
            for _, row in sample_rows.iterrows():
                record = dict(zip(self._VECTOR_COLUMNS, list(row[0:5])))
                response = scoring_server.score([record])
                self.assertEqual(
                    row.predicted_cluster,
                    response.json()["data"][0]['score'])


# Run this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# vim: set encoding=utf-8

# Copyright (c) 2016 Intel Corporation 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


""" Tests Linear Regression scoring engine """
import unittest
import os
from daaltkregtests.lib import daaltk_test
from daaltkregtests.lib import score_utils


class LinearRegression(daaltk_test.DaalTKTestCase):
    """Tests Linear Regression scoring engine."""

    def setUp(self):
        """Build test frame"""
        super(LinearRegression, self).setUp()
        data_file = self.get_file("linear_regression_gen.csv")
        schema = [("c1", float), ("c2", float), ("c3", float),
                  ("c4", float), ("label", float)]

        self.frame = self.context.frame.import_csv(data_file, schema=schema)

    def test_model_scoring(self):
        """Test publishing a linear regression model"""
        features = ['c1', 'c2', 'c3', 'c4']
        model = self.context.daaltk.models.regression.linear_regression.train(
            self.frame, "label", features)

        predictions = model.predict(self.frame, features)
        rows = predictions.to_pandas(predictions.count())

        mar_path = model.export_to_mar(
            self.get_export_file(self.get_name("linear_regression")))
        with score_utils.scorer(mar_path, self.id()) as scoring_server:
            for _, row in rows.iterrows():
                record = dict(zip(["c1", "c2", "c3", "c4"], list(row[0:4])))
                response = scoring_server.score([record])
                self.assertAlmostEqual(
                    row["predict_label"],
                    response.json()["data"][0]['score'])




# Run this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# vim: set encoding=utf-8

# Copyright (c) 2016 Intel Corporation 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


""" Tests Naive Bayes Model against known values. """
import unittest
import os
from daaltkregtests.lib import daaltk_test
from daaltkregtests.lib import score_utils


class NaiveBayes(daaltk_test.DaalTKTestCase):
    """Tests Naive Bayes Model against known values."""

    def setUp(self):
        """Build the frames needed for the tests."""
        super(NaiveBayes, self).setUp()

        data_file = self.get_file("naive_bayes.csv")
        schema = [("label", int), ("f1", int), ("f2", int), ("f3", int)]
        self.frame = self.context.frame.import_csv(data_file, schema=schema)

    def test_model_scoring(self):
        """Test training initializes theta, pi and labels"""
        features = ['f1', 'f2', 'f3']
        model = self.context.daaltk.models.classification.naive_bayes.train(
            self.frame, "label", features)

        predictions = model.predict(self.frame, features)
        rows = predictions.to_pandas()

        mar_path = model.export_to_mar(
            self.get_export_file(self.get_name("daal_naive_bayes")))
        with score_utils.scorer(mar_path, self.id()) as scoring_server:
            for _, row in rows.iterrows():
                # columns 1..3 are the integer features
                record = dict(zip(features, [int(v) for v in row[1:4]]))
                response = scoring_server.score([record])
                self.assertEqual(
                    response.json()["data"][0]['score'],
                    row.predicted_class)


# Run this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()
69 changes: 69 additions & 0 deletions regression-tests/daaltkregtests/testcases/scoretests/pca_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# vim: set encoding=utf-8

# Copyright (c) 2016 Intel Corporation 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


import unittest
from daaltkregtests.lib import daaltk_test
from daaltkregtests.lib import score_utils


class PrincipalComponent(daaltk_test.DaalTKTestCase):
    """Tests PCA scoring against the model's own predict() results."""

    # feature columns shared by the training CSV and the model
    COLUMNS = ["X1", "X2", "X3", "X4", "X5",
               "X6", "X7", "X8", "X9", "X10"]

    def setUp(self):
        """Import the training frame."""
        super(PrincipalComponent, self).setUp()
        schema = [(name, int) for name in self.COLUMNS]
        pca_traindata = self.get_file("pcadata.csv")
        self.frame = self.context.frame.import_csv(pca_traindata, schema=schema)

    def test_model_scoring(self):
        """Test pca scoring: the engine must reproduce predict()'s
        principal components for each row."""
        model = self.context.daaltk.models.dimreduction.pca.train(
            self.frame, self.COLUMNS, mean_centered=False, k=10)

        file_name = self.get_name("pca")
        model_path = model.export_to_mar(self.get_export_file(file_name))

        with score_utils.scorer(model_path, self.id()) as scorer:
            baseline = model.predict(self.frame, mean_centered=False)
            testvals = baseline.to_pandas(50)

            for index, row in testvals.iterrows():
                # row[0:10] holds the input features; row[10:] holds the
                # principal components computed by predict()
                r = scorer.score([dict(zip(self.COLUMNS, list(row[0:10])))])
                scored = r.json()["data"][-1]["principal_components"]
                # Compare element-wise with an explicit loop.  The original
                # used map() purely for its side effects, which is both
                # unidiomatic and broken under Python 3 where map() is lazy
                # and would never run the assertions.
                for got, expected in zip(scored, row[10:]):
                    self.assertAlmostEqual(float(got), float(expected))


# Run this test module directly with the standard unittest runner.
if __name__ == '__main__':
    unittest.main()