ibm-watson-machine-learning
This notebook introduces commands for model persistence to the Watson Machine Learning repository, model deployment, and scoring.
Some familiarity with Python is helpful. This notebook runs on Python and Spark.
You will use the German Credit Risk dataset.
The learning goals of this notebook are:
- Load a CSV file into an Apache® Spark DataFrame.
- Explore and prepare data for training and evaluation.
- Persist a pipeline and model in the Watson Machine Learning repository.
- Deploy the model for online scoring.
- Score sample records.
Before you use the sample code in this notebook, you must perform the following setup tasks:
Authenticate the Watson Machine Learning service on IBM Cloud. You need to provide the platform api_key and the instance location.
You can use the IBM Cloud CLI to retrieve the platform API key and instance location.
The API key can be generated in the following way:
ibmcloud login
ibmcloud iam api-key-create API_KEY_NAME
From the output, copy the value of api_key.
The location of your WML instance can be retrieved in the following way:
ibmcloud login --apikey API_KEY -a https://cloud.ibm.com
ibmcloud resource service-instance WML_INSTANCE_NAME
From the output, copy the value of location.
Tip: Your Cloud API key can be generated by going to the Users section of the Cloud console. From that page, click your name, scroll down to the API Keys section, and click Create an IBM Cloud API key. Give your key a name and click Create, then copy the created key and paste it below. You can also get a service-specific URL by going to the Endpoint URLs section of the Watson Machine Learning docs. You can check your instance location in your Watson Machine Learning (WML) Service instance details.
You can also get a service-specific apikey by going to the Service IDs section of the Cloud console. From that page, click Create, then copy the created key and paste it below.
Action: Enter your api_key and location in the following cell.
api_key = 'PASTE YOUR PLATFORM API KEY HERE'
location = 'PASTE YOUR INSTANCE LOCATION HERE'
wml_credentials = {
    "apikey": api_key,
    "url": 'https://' + location + '.ml.cloud.ibm.com'
}
!rm -rf /home/spark/shared/user-libs/python3.10*
!pip install -U --user ibm-watson-machine-learning
from ibm_watson_machine_learning import APIClient
client = APIClient(wml_credentials)
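As a quick sanity check (the client exposes its package version), you can print it to confirm the package installed correctly:

print(client.version)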
First of all, you need to create a space that will be used for your work. If you do not have a space already created, you can use the Deployment Spaces Dashboard to create one. Once created, copy the space_id and paste it below.
Tip: You can also use the SDK to prepare the space for your work. More information can be found here.
Action: Assign space ID below
space_id = 'PASTE YOUR SPACE ID HERE'
You can use the list method to print all existing spaces.
client.spaces.list(limit=10)
To be able to interact with all resources available in Watson Machine Learning, you need to set the space you will be using.
client.set.default_space(space_id)
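The call returns the string 'SUCCESS' when the space is set correctly; a minimal check (assuming that return value) could look like this:

# set.default_space is expected to return 'SUCCESS'; re-calling it is harmless
status = client.set.default_space(space_id)
assert status == 'SUCCESS', status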
try:
    from pyspark.sql import SparkSession
except ImportError:
    print('Error: Spark runtime is missing. If you are using Watson Studio, change the notebook runtime to Spark.')
    raise
In this section you will load the data as an Apache® Spark DataFrame and perform basic exploration.
The CSV file for German Credit Risk is available in the same repository as this notebook. Load the file into an Apache® Spark DataFrame using the code below.
!pip install wget
import os
from wget import download

sample_dir = 'spark_sample_model'
if not os.path.isdir(sample_dir):
    os.mkdir(sample_dir)

filename = os.path.join(sample_dir, 'credit_risk_training.csv')
if not os.path.isfile(filename):
    filename = download('https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/credit_risk/credit_risk_training.csv', out=sample_dir)
spark = SparkSession.builder.getOrCreate()
df_data = spark.read\
    .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load(filename)
Explore the loaded data by using the following Apache® Spark DataFrame methods:
df_data.printSchema()
As you can see, the data contains 21 fields. The Risk field is the one we would like to predict (the label).
df_data.show(n=5, truncate=False, vertical=True)
print("Number of records: " + str(df_data.count()))
As you can see, the data set contains 5000 records.
In this subsection you will split your data into train, test, and predict datasets.
splitted_data = df_data.randomSplit([0.8, 0.18, 0.02], 24)
train_data = splitted_data[0]
test_data = splitted_data[1]
predict_data = splitted_data[2]
print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))
print("Number of prediction records : " + str(predict_data.count()))
As you can see, our data has been successfully split into three datasets:
In this section you will learn how to store your pipeline and model in the Watson Machine Learning repository by using the Python client library.
Note: Apache® Spark is required.
The ibm-cos-sdk library allows Python developers to manage Cloud Object Storage (COS).
import ibm_boto3
from ibm_botocore.client import Config
Action: Put the credentials from your Object Storage Service instance on IBM Cloud here.
cos_credentials = {
PASTE YOUR COS CREDENTIALS HERE
}
# example:
# cos_credentials = {
# "apikey": "***",
# "cos_hmac_keys": {
# "access_key_id": "***",
# "secret_access_key": "***"
# },
# "endpoints": "https://cos-service.bluemix.net/endpoints",
# "iam_apikey_description": "***",
# "iam_apikey_name": "***",
# "iam_role_crn": "crn:v1:bluemix:public:iam::::serviceRole:Writer",
# "iam_serviceid_crn": "***",
# "resource_instance_id": "***"
# }
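Tip: If you saved the service credentials to a JSON file, you can load them instead of pasting (the file name here is just an example):

import json

# Hypothetical path - adjust to wherever you stored the credentials JSON
with open('cos_credentials.json') as f:
    cos_credentials = json.load(f)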
connection_apikey = cos_credentials['apikey']
connection_resource_instance_id = cos_credentials["resource_instance_id"]
connection_access_key_id = cos_credentials['cos_hmac_keys']['access_key_id']
connection_secret_access_key = cos_credentials['cos_hmac_keys']['secret_access_key']
Action: Define the service endpoint you will use.
Tip: You can find this information in the Endpoints section of your Cloud Object Storage instance's dashboard.
service_endpoint = 'https://s3.us.cloud-object-storage.appdomain.cloud'
You also need the IBM Cloud authorization endpoint to be able to create the COS resource object.
auth_endpoint = 'https://iam.cloud.ibm.com/identity/token'
Create the COS resource so that you can write data to Cloud Object Storage.
cos = ibm_boto3.resource('s3',
                         ibm_api_key_id=cos_credentials['apikey'],
                         ibm_service_instance_id=cos_credentials['resource_instance_id'],
                         ibm_auth_endpoint=auth_endpoint,
                         config=Config(signature_version='oauth'),
                         endpoint_url=service_endpoint)
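To verify that the resource object works, you can list the buckets in your instance (a minimal sketch using the boto3-style resource API):

# Print up to five existing buckets to confirm the credentials and endpoint work
for existing_bucket in cos.buckets.limit(5):
    print(existing_bucket.name)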
Now you will create a bucket in COS and upload the training dataset for the model from credit_risk_training.csv.
from uuid import uuid4

bucket_uid = str(uuid4())
score_filename = "credit_risk_training.csv"
buckets = ["credit-risk-" + bucket_uid]

for bucket in buckets:
    if not cos.Bucket(bucket) in cos.buckets.all():
        print('Creating bucket "{}"...'.format(bucket))
        try:
            cos.create_bucket(Bucket=bucket)
        except ibm_boto3.exceptions.ibm_botocore.client.ClientError as e:
            print('Error: {}.'.format(e.response['Error']['Message']))

bucket_obj = cos.Bucket(buckets[0])

print('Uploading data {}...'.format(score_filename))
with open(filename, 'rb') as f:
    bucket_obj.upload_fileobj(f, score_filename)
print('{} is uploaded.'.format(score_filename))
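You can confirm the upload by loading the object's metadata (this raises an error if the object is missing):

obj = cos.Object(buckets[0], score_filename)
obj.load()  # fetches the object's metadata; fails if the upload did not succeed
print('{} has {} bytes.'.format(score_filename, obj.content_length))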
datasource_type = client.connections.get_datasource_type_uid_by_name('bluemixcloudobjectstorage')
conn_meta_props = {
    client.connections.ConfigurationMetaNames.NAME: "COS connection - spark",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: datasource_type,
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': buckets[0],
        'access_key': connection_access_key_id,
        'secret_key': connection_secret_access_key,
        'iam_url': auth_endpoint,
        'url': service_endpoint
    }
}
conn_details = client.connections.create(meta_props=conn_meta_props)
Note: The above connection can be initialized alternatively with api_key and resource_instance_id. The above cell can be replaced with:
db_name = 'bluemixcloudobjectstorage'  # datasource type name, as above
bucket_name = buckets[0]

conn_meta_props = {
    client.connections.ConfigurationMetaNames.NAME: f"Connection to Database - {db_name}",
    client.connections.ConfigurationMetaNames.DATASOURCE_TYPE: client.connections.get_datasource_type_uid_by_name(db_name),
    client.connections.ConfigurationMetaNames.DESCRIPTION: "Connection to external Database",
    client.connections.ConfigurationMetaNames.PROPERTIES: {
        'bucket': bucket_name,
        'api_key': cos_credentials['apikey'],
        'resource_instance_id': cos_credentials['resource_instance_id'],
        'iam_url': 'https://iam.cloud.ibm.com/identity/token',
        'url': 'https://s3.us.cloud-object-storage.appdomain.cloud'
    }
}
conn_details = client.connections.create(meta_props=conn_meta_props)
connection_id = client.connections.get_uid(conn_details)
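You can use the list method to print existing connections and confirm that the new one is there:

client.connections.list()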
In this subsection you will learn how to save pipeline and model artifacts to your Watson Machine Learning instance.
Download pipeline and model archives
import os
from wget import download

sample_dir = 'spark_sample_model'
if not os.path.isdir(sample_dir):
    os.mkdir(sample_dir)

pipeline_filename = os.path.join(sample_dir, 'credit_risk_spark_pipeline.tar.gz')
if not os.path.isfile(pipeline_filename):
    pipeline_filename = download('https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/models/spark/credit-risk/model/credit_risk_spark_pipeline.tar.gz', out=sample_dir)

model_filename = os.path.join(sample_dir, 'credit_risk_spark_model.gz')
if not os.path.isfile(model_filename):
    model_filename = download('https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/models/spark/credit-risk/model/credit_risk_spark_model.gz', out=sample_dir)
Store pipeline and model
To be able to store your Spark model, you need to provide a training data reference; this allows the model schema to be read automatically.
training_data_references = [
    {
        "type": "connection_asset",
        "connection": {
            "id": connection_id,
        },
        "location": {
            "bucket": buckets[0],
            "file_name": score_filename,
        },
        "schema": {
            "id": "training_schema",
            "fields": [
                {"metadata": {}, "name": "CheckingStatus", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "LoanDuration", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "CreditHistory", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "LoanPurpose", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "LoanAmount", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "ExistingSavings", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "EmploymentDuration", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "InstallmentPercent", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "Sex", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "OthersOnLoan", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "CurrentResidenceDuration", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "OwnsProperty", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "Age", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "InstallmentPlans", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "Housing", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "ExistingCreditsCount", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "Job", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "Dependents", "nullable": True, "type": "integer"},
                {"metadata": {}, "name": "Telephone", "nullable": True, "type": "string"},
                {"metadata": {}, "name": "ForeignWorker", "nullable": True, "type": "string"},
                {"metadata": {"modeling_role": "target"}, "name": "Risk", "nullable": True, "type": "string"},
            ]
        }
    }
]
published_model_details = client.repository.store_model(
    model=model_filename,
    meta_props={
        client.repository.ModelMetaNames.NAME: 'Credit Risk model',
        client.repository.ModelMetaNames.TYPE: "mllib_3.3",
        client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: client.software_specifications.get_id_by_name('spark-mllib_3.3'),
        client.repository.ModelMetaNames.TRAINING_DATA_REFERENCES: training_data_references,
        client.repository.ModelMetaNames.LABEL_FIELD: "Risk",
    },
    training_data=train_data,
    pipeline=pipeline_filename)
model_id = client.repository.get_model_id(published_model_details)
print(model_id)
client.repository.get_model_details(model_id)
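The returned details follow the usual metadata/entity layout of Watson Machine Learning detail dictionaries, so you can, for example, read back the model name and type (a hedged sketch, assuming that layout):

model_details = client.repository.get_model_details(model_id)
# 'metadata' carries identity fields such as the name; 'entity' carries the model definition
print(model_details['metadata']['name'])
print(model_details['entity']['type'])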
Get the saved model metadata from Watson Machine Learning.
Tip: Use client.repository.ModelMetaNames.show() to get the list of available props.
client.repository.ModelMetaNames.show()
In this subsection you will learn how to load the saved model back from your Watson Machine Learning instance.
loaded_model = client.repository.load(model_id)
You can, for example, print the type of the loaded object to make sure that the model has been loaded correctly.
print(type(loaded_model))
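Assuming the loaded object is a pyspark.ml PipelineModel, you can also inspect its stages:

# Each stage is a fitted transformer or model from the original pipeline
for stage in loaded_model.stages:
    print(type(stage).__name__)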
In this section you will learn how to score test data using the loaded model.
In this subsection you will score the predict_data dataset.
predictions = loaded_model.transform(predict_data)
Preview the results by calling the show() method on the predictions DataFrame.
predictions.show(5, vertical=True)
By tabulating a count, you can see the distribution of predicted labels.
predictions.select("predictedLabel").groupBy("predictedLabel").count().show(truncate=False)
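Beyond the label distribution, you can get a rough accuracy estimate by scoring test_data and comparing the decoded predictions with the ground truth (a minimal sketch, assuming predictedLabel holds the decoded label as shown above):

test_predictions = loaded_model.transform(test_data)
# Count rows where the decoded prediction matches the actual Risk value
correct = test_predictions.filter(test_predictions.predictedLabel == test_predictions.Risk).count()
print('Accuracy on test data: {:.3f}'.format(correct / test_predictions.count()))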
In this section you will learn how to create an online deployment and score a new data record using ibm-watson-machine-learning.
Note: You can also use REST API to deploy and score. For more information about REST APIs, see the Swagger Documentation.
Now you can create an online scoring endpoint.
deployment_details = client.deployments.create(
    model_id,
    meta_props={
        client.deployments.ConfigurationMetaNames.NAME: "Credit Risk model deployment",
        client.deployments.ConfigurationMetaNames.ONLINE: {}
    }
)
deployment_details
Now, you can send new scoring records (new data) for which you would like to get predictions. To do that, execute the following sample code:
fields = ["CheckingStatus", "LoanDuration", "CreditHistory", "LoanPurpose", "LoanAmount", "ExistingSavings",
"EmploymentDuration", "InstallmentPercent", "Sex", "OthersOnLoan", "CurrentResidenceDuration",
"OwnsProperty", "Age", "InstallmentPlans", "Housing", "ExistingCreditsCount", "Job", "Dependents",
"Telephone", "ForeignWorker"]
values = [
["no_checking", 13, "credits_paid_to_date", "car_new", 1343, "100_to_500", "1_to_4", 2, "female", "none", 3,
"savings_insurance", 46, "none", "own", 2, "skilled", 1, "none", "yes"],
["no_checking", 24, "prior_payments_delayed", "furniture", 4567, "500_to_1000", "1_to_4", 4, "male", "none",
4, "savings_insurance", 36, "none", "free", 2, "management_self-employed", 1, "none", "yes"],
["0_to_200", 26, "all_credits_paid_back", "car_new", 863, "less_100", "less_1", 2, "female", "co-applicant",
2, "real_estate", 38, "none", "own", 1, "skilled", 1, "none", "yes"],
["0_to_200", 14, "no_credits", "car_new", 2368, "less_100", "1_to_4", 3, "female", "none", 3, "real_estate",
29, "none", "own", 1, "skilled", 1, "none", "yes"],
["0_to_200", 4, "no_credits", "car_new", 250, "less_100", "unemployed", 2, "female", "none", 3,
"real_estate", 23, "none", "rent", 1, "management_self-employed", 1, "none", "yes"],
["no_checking", 17, "credits_paid_to_date", "car_new", 832, "100_to_500", "1_to_4", 2, "male", "none", 2,
"real_estate", 42, "none", "own", 1, "skilled", 1, "none", "yes"],
["no_checking", 33, "outstanding_credit", "appliances", 5696, "unknown", "greater_7", 4, "male",
"co-applicant", 4, "unknown", 54, "none", "free", 2, "skilled", 1, "yes", "yes"],
["0_to_200", 13, "prior_payments_delayed", "retraining", 1375, "100_to_500", "4_to_7", 3, "male", "none", 3,
"real_estate", 37, "none", "own", 2, "management_self-employed", 1, "none", "yes"]
]
payload_scoring = {"input_data": [{"fields": fields, "values": values}]}
deployment_id = client.deployments.get_id(deployment_details)
client.deployments.score(deployment_id, payload_scoring)
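The scoring response typically follows the {'predictions': [{'fields': [...], 'values': [...]}]} layout, so you can extract the predicted labels like this (a hedged sketch, assuming the response includes a predictedLabel field):

scoring_response = client.deployments.score(deployment_id, payload_scoring)
prediction = scoring_response['predictions'][0]
# Look up the column index of the decoded label in the response fields
label_idx = prediction['fields'].index('predictedLabel')
for row in prediction['values']:
    print(row[label_idx])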
If you want to clean up all created assets, please follow this sample notebook.
You successfully completed this notebook! You learned how to use Apache Spark machine learning as well as Watson Machine Learning for model creation and deployment. Check out our Online Documentation for more samples, tutorials, documentation, how-tos, and blog posts.
Amadeusz Masny, Python Software Developer in Watson Machine Learning at IBM
Copyright © 2020-2024 IBM. This notebook and its source code are released under the terms of the MIT License.