on 07-19-2016 3:52 PM
Hi,
I would like to use Random Forest from PAL for a Regression Task. I used the example as shown at Random Forest - SAP HANA Predictive Analysis Library (PAL) - SAP Library and it works perfect for classification.
Unfortunatly there is no example for regression so I'm not sure what to with the Confusion Matrix. The Confusion Matrix Table is a parameter expected by the RANDOMFORESTTRAIN function but can not be used with regression. If I try to use RANDOMFORESTTRAIN for an regression task anyway I get an error: [2620] executor: plan operation failed;Can not insert data from temp table "SCHEMA:P6_578EC76B35627559E10000000A443055 (t -1)" into table ""SCHEMA"."PAL_RF_CONFUSION_TBL"".
I already tried to use RANDOMFORESTTRAIN without the Confusion Matrix as a parameter, but then also get an error.
Has someone successfully implemented PAL Random Forest for regression? An example would be great.
Thanks a lot in adavance
Hi, a working example is attached. By the way, which HANA SPS are you using? We have some bug fixing and performance for random forest in recent revisions, it is recommended to upgrade before use.
Best regards,
Xingtian
SET SCHEMA DM_PAL;
DROP TYPE PAL_RF_DATA_T;
CREATE TYPE PAL_RF_DATA_T AS TABLE(
"OUTLOOK" VARCHAR(20),
"TEMP" DOUBLE,
"HUMIDITY" DOUBLE,
"WINDY" VARCHAR(10),
"CLASS" double
);
DROP TYPE PAL_RF_MODEL_T;
CREATE TYPE PAL_RF_MODEL_T AS TABLE(
"ID" INTEGER,
"TREEINDEX" INTEGER,
"MODEL" VARCHAR(5000)
);
DROP TYPE PAL_RF_VAR_IMP_T;
CREATE TYPE PAL_RF_VAR_IMP_T AS TABLE(
"VAR" VARCHAR(100),
"IMP" DOUBLE
);
DROP TYPE PAL_RF_ERR_RATE_T;
CREATE TYPE PAL_RF_ERR_RATE_T AS TABLE(
"TREEINDEX" INTEGER,
"ERR" DOUBLE
);
DROP TYPE PAL_RF_CONFUSION_T;
CREATE TYPE PAL_RF_CONFUSION_T AS TABLE(
"ID" INTEGER,
"CONTENT" VARCHAR(1000)
);
DROP TYPE PAL_CONTROL_T;
CREATE TYPE PAL_CONTROL_T AS TABLE(
"NAME" VARCHAR (100),
"INTARGS" INTEGER,
"DOUBLEARGS" DOUBLE,
"STRINGARGS" VARCHAR(100)
);
DROP TABLE PAL_RF_PDATA_TBL;
CREATE COLUMN TABLE PAL_RF_PDATA_TBL(
"POSITION" INT,
"SCHEMA_NAME" NVARCHAR(256),
"TYPE_NAME" NVARCHAR(256),
"PARAMETER_TYPE" VARCHAR(7)
);
INSERT INTO PAL_RF_PDATA_TBL VALUES (1, 'DM_PAL', 'PAL_RF_DATA_T', 'in');
INSERT INTO PAL_RF_PDATA_TBL VALUES (2, 'DM_PAL', 'PAL_CONTROL_T', 'in');
INSERT INTO PAL_RF_PDATA_TBL VALUES (3, 'DM_PAL', 'PAL_RF_MODEL_T', 'out');
INSERT INTO PAL_RF_PDATA_TBL VALUES (4, 'DM_PAL', 'PAL_RF_VAR_IMP_T', 'out');
INSERT INTO PAL_RF_PDATA_TBL VALUES (5, 'DM_PAL', 'PAL_RF_ERR_RATE_T', 'out');
INSERT INTO PAL_RF_PDATA_TBL VALUES (6, 'DM_PAL', 'PAL_RF_CONFUSION_T', 'out');
CALL "SYS".AFLLANG_WRAPPER_PROCEDURE_DROP('DM_PAL', 'PAL_RF_TRAINING_PROC');
CALL "SYS".AFLLANG_WRAPPER_PROCEDURE_CREATE('AFLPAL', 'RANDOMFORESTTRAIN', 'DM_PAL', 'PAL_RF_TRAINING_PROC', PAL_RF_PDATA_TBL);
DROP TABLE PAL_RF_DATA_TBL;
CREATE COLUMN TABLE PAL_RF_DATA_TBL LIKE PAL_RF_DATA_T;
INSERT INTO PAL_RF_DATA_TBL VALUES ('Sunny', 75, 70, 'Yes', 1.5);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Sunny', null, 90, 'Yes', 0.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Sunny', 85, null, 'No', 0.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Sunny', 72, 95, 'No', 0.0);
INSERT INTO PAL_RF_DATA_TBL VALUES (null, null, 70, null, 1.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Overcast', 72, 90, 'Yes', 0.5);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Overcast', 83, 78, 'No', 1.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Overcast', 64, 65, 'Yes', 1.5);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Overcast', 81, 75, 'No', 1.0);
INSERT INTO PAL_RF_DATA_TBL VALUES (null, 71, 80, 'Yes', 0.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Rain', 65, 70, 'Yes', 0.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Rain', 75, 80, 'No', 1.0);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Rain', 68, 80, 'No', 0.5);
INSERT INTO PAL_RF_DATA_TBL VALUES ('Rain', 70, 96, 'No', 1.0);
DROP TABLE #PAL_CONTROL_TBL;
CREATE LOCAL TEMPORARY COLUMN TABLE #PAL_CONTROL_TBL(
"NAME" VARCHAR (100),
"INTARGS" INTEGER,
"DOUBLEARGS" DOUBLE,
"STRINGARGS" VARCHAR (100)
);
INSERT INTO #PAL_CONTROL_TBL VALUES ('TREES_NUM', 300, null,null);
INSERT INTO #PAL_CONTROL_TBL VALUES ('TRY_NUM', 3, null,null);
INSERT INTO #PAL_CONTROL_TBL VALUES ('SEED', 2, null,null);
DROP TABLE PAL_RF_MODEL_TBL;
CREATE COLUMN TABLE PAL_RF_MODEL_TBL LIKE PAL_RF_MODEL_T;
DROP TABLE PAL_RF_VAR_IMP_TBL;
CREATE COLUMN TABLE PAL_RF_VAR_IMP_TBL LIKE PAL_RF_VAR_IMP_T;
DROP TABLE PAL_RF_ERR_RATE_TBL;
CREATE COLUMN TABLE PAL_RF_ERR_RATE_TBL LIKE PAL_RF_ERR_RATE_T;
DROP TABLE PAL_RF_CONFUSION_TBL;
CREATE COLUMN TABLE PAL_RF_CONFUSION_TBL LIKE PAL_RF_CONFUSION_T;
CALL "DM_PAL".PAL_RF_TRAINING_PROC(PAL_RF_DATA_TBL, #PAL_CONTROL_TBL, PAL_RF_MODEL_TBL, PAL_RF_VAR_IMP_TBL, PAL_RF_ERR_RATE_TBL, PAL_RF_CONFUSION_TBL) WITH OVERVIEW;
SELECT * FROM PAL_RF_MODEL_TBL;
SELECT * FROM PAL_RF_VAR_IMP_TBL;
SELECT * FROM PAL_RF_ERR_RATE_TBL;
SELECT * FROM PAL_RF_CONFUSION_TBL;
-----------------------------------------
DROP TYPE PAL_RF_SCORING_DATA_T;
CREATE TYPE PAL_RF_SCORING_DATA_T AS TABLE(
"ID" INTEGER,
"OUTLOOK" VARCHAR(20),
"TEMP" INTEGER,
"HUMIDITY" DOUBLE,
"WINDY" VARCHAR(10)
);
DROP TYPE PAL_RF_SCORING_MODEL_T;
CREATE TYPE PAL_RF_SCORING_MODEL_T AS TABLE(
"ID" INTEGER,
"TREEINDEX" INTEGER,
"MODEL" VARCHAR(5000)
);
DROP TYPE PAL_RF_SCORING_RESULT_T;
CREATE TYPE PAL_RF_SCORING_RESULT_T AS TABLE("ID" INTEGER, "SCORING" VARCHAR(50), "PROB" DOUBLE);
DROP TYPE PAL_CONTROL_T;
CREATE TYPE PAL_CONTROL_T AS TABLE(
"NAME" VARCHAR(100),
"INTARGS" INTEGER,
"DOUBLEARGS" DOUBLE,
"STRINGARGS" VARCHAR(100)
);
DROP TABLE PAL_RF_SCORING_PDATA_TBL;
CREATE COLUMN TABLE PAL_RF_SCORING_PDATA_TBL (
"POSITION" INT,
"SCHEMA_NAME" NVARCHAR(256),
"TYPE_NAME" NVARCHAR(256),
"PARAMETER_TYPE" VARCHAR(7)
);
INSERT INTO PAL_RF_SCORING_PDATA_TBL VALUES (1, 'DM_PAL', 'PAL_RF_SCORING_DATA_T', 'IN');
INSERT INTO PAL_RF_SCORING_PDATA_TBL VALUES (2, 'DM_PAL', 'PAL_CONTROL_T', 'IN');
INSERT INTO PAL_RF_SCORING_PDATA_TBL VALUES (3, 'DM_PAL', 'PAL_RF_SCORING_MODEL_T', 'IN');
INSERT INTO PAL_RF_SCORING_PDATA_TBL VALUES (4, 'DM_PAL', 'PAL_RF_SCORING_RESULT_T', 'OUT');
CALL "SYS".AFLLANG_WRAPPER_PROCEDURE_DROP('DM_PAL', 'PAL_RF_SCORING_PROC');
CALL "SYS".AFLLANG_WRAPPER_PROCEDURE_CREATE('AFLPAL', 'RANDOMFORESTSCORING', 'DM_PAL', 'PAL_RF_SCORING_PROC', PAL_RF_SCORING_PDATA_TBL);
DROP TABLE #PAL_CONTROL_TBL;
CREATE LOCAL TEMPORARY COLUMN TABLE #PAL_CONTROL_TBL (
"NAME" VARCHAR(100),
"INTARGS" INTEGER,
"DOUBLEARGS" DOUBLE,
"STRINGARGS" VARCHAR(100)
);
INSERT INTO #PAL_CONTROL_TBL VALUES ('IS_OUTPUT_PROBABILITY', 0, null, null);
DROP TABLE PAL_RF_SCORING_DATA_TBL;
CREATE COLUMN TABLE PAL_RF_SCORING_DATA_TBL LIKE PAL_RF_SCORING_DATA_T;
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (0, 'Overcast', 75, -10000, 'Yes');
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (1, 'Rain', 78, 70, 'Yes');
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (2, 'Sunny', -10000, null, 'Yes');
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (3, 'Sunny', 69, 70, 'Yes');
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (4, 'Rain', null, 70, 'Yes');
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (5, null, 70, 70, 'Yes');
INSERT INTO PAL_RF_SCORING_DATA_TBL VALUES (6, '***', 70, 70, 'Yes');
DROP TABLE PAL_RF_SCORING_RESULT_TBL;
CREATE COLUMN TABLE PAL_RF_SCORING_RESULT_TBL LIKE PAL_RF_SCORING_RESULT_T;
CALL "DM_PAL".PAL_RF_SCORING_PROC(PAL_RF_SCORING_DATA_TBL, #PAL_CONTROL_TBL, PAL_RF_MODEL_TBL, PAL_RF_SCORING_RESULT_TBL) with OVERVIEW;
SELECT * FROM PAL_RF_SCORING_RESULT_TBL;
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
Hi Xingtian,
thanks for your help. Example works great. Is it essential, that the last column of the training dataset is of type double? I tried it with type integer and definied this column as continous in the PAL_CONTROL_TBL. This however does not work.
We are on HANA SPS 11 Rev 03. Which version do you recommend?
Thanks a lot.
Frederic
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
84 | |
23 | |
11 | |
9 | |
8 | |
5 | |
5 | |
5 | |
5 | |
4 |
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.