[2]:
import pandas as pd
from scipy.io import arff
from rulekit import RuleKit
from rulekit.survival import SurvivalRules
from rulekit.params import Measures
from rulexai.explainer import RuleExplainer
GBSG2¶
Read data¶
[3]:
# Load the GBSG2 dataset from its ARFF file. Only the first element of the
# value returned by loadarff is used to build the DataFrame.
dataset_path = "./data/GBSG2.arff"
arff_records = arff.loadarff(dataset_path)[0]
data = pd.DataFrame(arff_records)

# The object-dtype columns are decoded from UTF-8 bytes to str in one pass
# (stack -> decode -> unstack); each decoded column is then written back into
# `data` with the "?" marker replaced by None.
decoded_text = (
    data.select_dtypes([object])
    .stack()
    .str.decode("utf-8")
    .unstack()
)
for column_name in decoded_text:
    data[column_name] = decoded_text[column_name].replace({"?": None})

# Labels are the status column; features are every remaining column.
y = data["survival_status"]
x = data.drop(columns=["survival_status"])
Train RuleKit model¶
[4]:
# Train a RuleKit survival rule model.
# RuleKit.init() is called before any model object is constructed
# (NOTE(review): presumably it prepares the RuleKit backend — confirm in the RuleKit docs).
RuleKit.init()
# x still contains the "survival_time" column (only "survival_status" was
# dropped when x was built); the model is told to use it as the survival time attribute.
srv = SurvivalRules(survival_time_attr="survival_time")
# Last expression of the cell, so the value returned by fit() is echoed as the cell output.
srv.fit(values=x, labels=y)
[4]:
<rulekit.survival.SurvivalRules at 0x176db91a880>
Rules¶
[5]:
# Print each rule of the trained model followed by its statistics, one rule per line.
for induced_rule in srv.model.rules:
    print(induced_rule, induced_rule.stats)
IF pnodes = (-inf, 3.50) THEN survival_status = {NaN} (p = 304.0, n = 0.0, P = 564.0, N = 0.0, weight = 0.9999999999999998, pvalue = 2.220446049250313e-16)
IF pnodes = (-inf, 17.50) AND progrec = (-inf, 9.50) AND age = <41.50, 52.50) AND estrec = <0.50, 29) THEN survival_status = {NaN} (p = 21.0, n = 0.0, P = 564.0, N = 0.0, weight = 0.9999999999909083, pvalue = 9.09172737095787e-12)
IF pnodes = <4.50, 19) AND progrec = (-inf, 11.50) AND age = <41.50, 64.50) AND estrec = <0.50, 41) THEN survival_status = {NaN} (p = 33.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
IF pnodes = <4.50, inf) AND progrec = (-inf, 25.50) THEN survival_status = {NaN} (p = 113.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
IF pnodes = <4.50, inf) AND progrec = (-inf, 99) THEN survival_status = {NaN} (p = 156.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
IF pnodes = <5.50, inf) AND progrec = (-inf, 135) THEN survival_status = {NaN} (p = 144.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
IF pnodes = <4.50, inf) AND progrec = (-inf, 233) THEN survival_status = {NaN} (p = 185.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
IF pnodes = (-inf, 4.50) AND progrec = <9, inf) AND age = <39.50, inf) THEN survival_status = {NaN} (p = 245.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
IF progrec = <107, inf) THEN survival_status = {NaN} (p = 168.0, n = 0.0, P = 564.0, N = 0.0, weight = 0.9999999989621143, pvalue = 1.0378856662995872e-09)
IF pnodes = <3.50, inf) AND progrec = (-inf, 105.50) THEN survival_status = {NaN} (p = 195.0, n = 0.0, P = 564.0, N = 0.0, weight = 1.0, pvalue = 0.0)
RuleXAI¶
[6]:
# Build a RuleXAI explainer from the trained survival model and the same
# features/labels it was fitted on; type="survival" matches the SurvivalRules model.
explainer = RuleExplainer(model=srv, X=x, y=y, type="survival")
# explain() is called before the importance attributes are read in the following cells
# (NOTE(review): presumably it computes them — confirm in the RuleXAI docs).
# As the last expression of the cell, its return value is echoed as the cell output.
explainer.explain()
[6]:
<rulexai.explainer.RuleExplainer at 0x176db937700>
Feature importance¶
[7]:
# Display the explainer's per-attribute importance table (columns: attributes, importances).
explainer.feature_importances_
[7]:
 | attributes | importances |
---|---|---|
2 | pnodes | 460.222804 |
3 | progrec | 251.499862 |
0 | age | 20.523849 |
1 | estrec | 13.347720 |
Condition importance¶
[8]:
# Display the explainer's per-condition importance table (columns: conditions, importances).
explainer.condition_importances_
[8]:
 | conditions | importances |
---|---|---|
0 | pnodes = <4.5, inf) | 207.268572 |
1 | pnodes = (-inf, 3.5) | 67.394775 |
2 | pnodes = <5.5, inf) | 64.254026 |
3 | pnodes = <3.5, inf) | 64.104973 |
4 | progrec = (-inf, 25.5) | 48.923100 |
5 | progrec = <107.0, inf) | 37.252374 |
6 | progrec = (-inf, 105.5) | 33.962572 |
7 | progrec = (-inf, 99.0) | 33.423755 |
8 | pnodes = (-inf, 4.5) | 32.835122 |
9 | progrec = (-inf, 135.0) | 25.353218 |
10 | progrec = (-inf, 11.5) | 23.663185 |
11 | progrec = (-inf, 9.5) | 23.506762 |
12 | pnodes = <4.5, 19.0) | 18.150272 |
13 | progrec = <9.0, inf) | 13.146344 |
14 | progrec = (-inf, 233.0) | 12.268552 |
15 | estrec = <0.5, 29.0) | 10.450381 |
16 | age = <41.5, 64.5) | 9.275232 |
17 | age = <41.5, 52.5) | 8.077389 |
18 | pnodes = (-inf, 17.5) | 6.215064 |
19 | age = <39.5, inf) | 3.171229 |
20 | estrec = <0.5, 41.0) | 2.897339 |
Local explainability¶
[9]:
# Explain the first example: select its feature row and its label row
# (the labels are wrapped in a DataFrame before the row is taken), then
# request the explanation with plotting enabled.
first_example = x.iloc[0]
first_label = pd.DataFrame(y).iloc[0]
explainer.local_explainability(first_example, first_label, plot=True)
Example:
horTh no
age 70.0
menostat Post
tsize 21.0
tgrade II
pnodes 3.0
progrec 48.0
estrec 66.0
survival_time 1814.0
survival_status 1.0
Name: 0, dtype: object
Rules that covers this example:
IF pnodes = (-inf, 3.5) THEN survival_status = {NaN}
IF pnodes = (-inf, 4.5) AND progrec = <9.0, inf) AND age = <39.5, inf) THEN survival_status = {NaN}
Importances of the conditions from rules covering the example
conditions importances
0 pnodes = (-inf, 3.5) 67.394775
1 pnodes = (-inf, 4.5) 32.835122
2 progrec = <9.0, inf) 13.146344
3 age = <39.5, inf) 3.171229
[9]:
 | conditions | importances |
---|---|---|
0 | pnodes = (-inf, 3.5) | 67.394775 |
1 | pnodes = (-inf, 4.5) | 32.835122 |
2 | progrec = <9.0, inf) | 13.146344 |
3 | age = <39.5, inf) | 3.171229 |