This notebook demonstrates how to train different text classifiers using Watson NLP. The classifiers predict the product group from the text of a customer complaint. This could be used, for example, to route a complaint to the appropriate staff member.
The data that is used in this notebook is taken from the Consumer Complaint Database that is published by the Consumer Financial Protection Bureau (CFPB), a U.S. government agency. The Consumer Complaint Database is a collection of complaints about consumer financial products and services that the CFPB sent to companies for response. A complaint contains the consumer’s narrative description of their experience if the consumer opted to share this information publicly and after the Bureau has removed all personal information. In this notebook, you will focus on complaints that contain this narrative description to show how to use Watson NLP.
The data is publicly available at https://www.consumerfinance.gov/data-research/consumer-complaints/.
Watson NLP implements state-of-the-art classification algorithms from three different families:
Watson NLP also offers an easy-to-use ensemble classifier that combines different classification algorithms through weighted majority voting.
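To illustrate the idea behind confidence-weighted voting, independent of the Watson NLP API, here is a minimal sketch in plain Python. The class labels and confidence scores are invented for illustration only:

```python
from collections import defaultdict

def weighted_vote(predictions, weights):
    """Combine per-classifier confidence scores into a single prediction.

    predictions: list of dicts mapping class label -> confidence score,
                 one dict per base classifier.
    weights:     one weight per base classifier.
    """
    combined = defaultdict(float)
    total_weight = sum(weights)
    for scores, weight in zip(predictions, weights):
        for label, confidence in scores.items():
            combined[label] += weight * confidence / total_weight
    # the class with the highest weighted-mean confidence wins
    return max(combined, key=combined.get)

# illustrative scores from two hypothetical base classifiers
svm_tfidf = {"Mortgage": 0.7, "Debt collection": 0.3}
svm_use   = {"Mortgage": 0.4, "Debt collection": 0.6}
print(weighted_vote([svm_tfidf, svm_use], weights=[1.0, 1.0]))  # Mortgage
```

With equal weights the two classifiers' scores are simply averaged; changing the weights shifts the influence of each base classifier on the final prediction.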
In this notebook, you'll learn how to:
watson_nlp.blocks.classification.SVM: SVM stands for Support Vector Machines, an established classification method. You will run it on USE (Universal Sentence Encoder) embeddings of the input text.
watson_nlp.workflows.classification.GenericEnsemble: The generic ensemble supports any combination of the three base classifiers CNN, SVM with TF-IDF, and SVM with USE (Universal Sentence Encoder). It computes the weighted mean of the classification predictions using confidence scores.
Before you can begin working on this notebook in Watson Studio in Cloud Pak for Data as a Service, you need to ensure that the project token is set so that you can access the project assets via the notebook.
When this notebook is added to the project, a project access token should be inserted at the top of the notebook in a code cell. If you do not see the cell above, add the token to the notebook by clicking More > Insert project token from the notebook action bar. By running the inserted hidden code cell, a project object is created that you can use to access project resources.
Note that you can step through the notebook execution cell by cell, by selecting Shift-Enter. Or you can execute the entire notebook by selecting Cell -> Run All from the menu.
Begin by importing and initializing some helper libraries that are used throughout the notebook.
import pandas as pd
# we want to show large text snippets to be able to explore the relevant text
pd.options.display.max_colwidth = 400
import json
import seaborn as sn
import matplotlib.pyplot as plt
import watson_nlp
from watson_core.data_model.streams.resolver import DataStreamResolver
from watson_core.toolkit import fileio
from watson_nlp.blocks.classification.svm import SVM
The data can be downloaded via an API from https://www.consumerfinance.gov/data-research/consumer-complaints/. The export covers one month of data and includes only those complaints that contain the consumer narrative text. The data is exported in CSV format. The URL to retrieve this data looks like this:
url = "https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/?date_received_max=2021-03-30&date_received_min=2021-02-28&field=all&format=csv&has_narrative=true&no_aggs=true&size=18102"
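The same URL can also be assembled from its individual query parameters, which makes it easier to adjust the date range or sample size. A small sketch using only the Python standard library:

```python
from urllib.parse import urlencode

base = "https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/"
params = {
    "date_received_max": "2021-03-30",  # end of the export window
    "date_received_min": "2021-02-28",  # start of the export window
    "field": "all",
    "format": "csv",
    "has_narrative": "true",            # only complaints with a narrative
    "no_aggs": "true",
    "size": 18102,                      # maximum number of rows to return
}
url = f"{base}?{urlencode(params)}"
```

Changing `date_received_min`/`date_received_max` here retrieves a different time window from the CFPB API.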
Read the data into a dataframe. You can find a detailed explanation of the available columns here: https://www.consumerfinance.gov/complaint/data-use/#:~:text=Types%20of%20complaint%20data%20we%20publish .
In your analysis you will focus on the Product column, which contains the product group, and the column with the complaint text Consumer complaint narrative.
complaint_df = pd.read_csv(url)
text_col = 'Consumer complaint narrative'
complaint_df.head(2)
Date received | Product | Sub-product | Issue | Sub-issue | Consumer complaint narrative | Company public response | Company | State | ZIP code | Tags | Consumer consent provided? | Submitted via | Date sent to company | Company response to consumer | Timely response? | Consumer disputed? | Complaint ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 03/12/21 | Debt collection | Other debt | Communication tactics | Frequent or repeated calls | Sunset Finance called from one of their phone numbers ( mainline ) and then immediately called back from a secondary line before I could even respond to the first call. This tactic is deceptive in nature and harassing in the fact that the calls are back to back with no reasonable time between them. When I returned the call, the representative became verbally abusive and argumentative stating t... | None | Sunset Management, Inc | GA | 30134 | None | Consent provided | Web | 03/12/21 | Closed with explanation | Yes | NaN | 4210022 |
1 | 03/12/21 | Debt collection | Medical debt | Took or threatened to take negative or legal action | Collected or attempted to collect exempt funds | Hi, I am writing this email regarding a court case that was defaulted against me. I am a combat veteran who has been in 5 combat deployments and was blown up and shot. I suffered significant and life long disabilities. I have undergone over XXXX surgeries. I am honorably discharged and seek medical through the VA. \nIn XX/XX/2017, I suffered a XXXX XXXX XXXX and lost all XXXX in my XXXX. I cou... | Company disputes the facts presented in the complaint | California Business Bureau, Inc. | CA | 92563 | Servicemember | Consent provided | Web | 03/12/21 | Closed with explanation | Yes | NaN | 4208052 |
Let's look at all product groups that are available in the data set because these are the classes that the classifier should predict from a given complaint text.
product_counts = complaint_df['Product'].value_counts()
product_counts
Credit reporting, credit repair services, or other personal consumer reports    8890
Debt collection                                                                 3340
Credit card or prepaid card                                                     1555
Mortgage                                                                        1548
Checking or savings account                                                     1122
Money transfer, virtual currency, or money service                               914
Vehicle loan or lease                                                            373
Student loan                                                                     193
Payday loan, title loan, or personal loan                                        193
Name: Product, dtype: int64
Many classification algorithms work best if the training samples are equally split across the classes. If the data is unbalanced, algorithms might decide to favor classes with many samples to achieve an overall good result. To avoid this, you will sample the data in the next step to have a similar amount of samples for each class.
To avoid long runtimes in this sample notebook, you will use only a small number of samples. However, this can reduce the quality of the classification models. In a real-case scenario, you should increase the number of samples per product group to get better results.
# number of complaints for each product group
sample_size = 300
# drop the two smallest product groups (fewer than sample_size complaints each)
train_test_df = complaint_df.query("Product != 'Student loan' and Product != 'Payday loan, title loan, or personal loan'")
# sample the data to have the same number of complaints for each product group
train_test_df = train_test_df.groupby('Product').sample(n=sample_size, random_state=5).reset_index(drop=True)
In the next step, you will split the data into training and test data (ratio:80/20).
# 80% training data
train_orig_df = train_test_df.groupby('Product').sample(frac=0.8, random_state=6)
print("Training data:")
print("Number of training samples: {}".format(len(train_orig_df)))
print("Samples by product group:\n{}".format(train_orig_df['Product'].value_counts()))
# 20% test data
test_orig_df = train_test_df.drop(train_orig_df.index)
print("\nTest data:")
print("Number of test samples: {}".format(len(test_orig_df)))
print("Samples by product group:\n{}".format(test_orig_df['Product'].value_counts()))
# re-index after sampling
train_orig_df = train_orig_df.reset_index(drop=True)
test_orig_df = test_orig_df.reset_index(drop=True)
Training data:
Number of training samples: 1680
Samples by product group:
Checking or savings account                                                     240
Credit card or prepaid card                                                     240
Credit reporting, credit repair services, or other personal consumer reports    240
Debt collection                                                                 240
Money transfer, virtual currency, or money service                              240
Mortgage                                                                        240
Vehicle loan or lease                                                           240
Name: Product, dtype: int64

Test data:
Number of test samples: 420
Samples by product group:
Checking or savings account                                                     60
Credit card or prepaid card                                                     60
Credit reporting, credit repair services, or other personal consumer reports    60
Debt collection                                                                 60
Money transfer, virtual currency, or money service                              60
Mortgage                                                                        60
Vehicle loan or lease                                                           60
Name: Product, dtype: int64
You have created two dataframes, one for the training and one for the test data. The data is still in its original format. Now you need to bring the data into a format that is usable by the Watson NLP classification algorithms. This can be either JSON or CSV format.
In the sample, you will create the data in JSON format. The training and test data is written to files.
def prepare_data(df):
# only the text column and the target label *Product* are needed
df_out = df[[text_col, 'Product']].reset_index(drop=True)
# rename to the identifiers expected by Watson NLP
df_out = df_out.rename(columns={text_col: "text", 'Product': 'labels'})
# the label column should be an array (although we have only one label per complaint)
df_out['labels'] = df_out['labels'].map(lambda label: [label,])
return df_out
train_df = prepare_data(train_orig_df)
train_file = './train_data.json'
train_df.to_json(train_file, orient='records')
test_df = prepare_data(test_orig_df)
test_file = './test_data.json'
test_df.to_json(test_file, orient='records')
train_df.head(2)
text | labels | |
---|---|---|
0 | I received notification of several withdrawals in my account to XXXX. Two where automatically refunded than taken again. For a total of {$3800.00} I attempted to file fraud on the transactions and they closed my account. Since the account was closed I have no way to contact anyone at the bank about the fraudulent charges. Only XXXX was included in the fraud. Another {$2000.00} to XXXX was made... | [Checking or savings account] |
1 | I am not able to connect my external bank account to the savings account, i emailed credit karma support on XX/XX/XXXX and they replied back within minutes but they have not resolved my issue and now they wont reply back to any other emails I send. I have funds in this account and need access to it, Ive suggested they send me a check for my funds but Ive got no reply back, we are in the middle... | [Checking or savings account] |
SVM is an established classification approach. Watson NLP includes an SVM algorithm that uses the SnapML
library for faster training. The algorithm operates on USE (Universal Sentence Encoder) embeddings, which encode the semantics of the input text into a vector space.
The SVM classifier block depends on the syntax block. So, start by loading the syntax model and the USE embeddings.
# Syntax Model
syntax_model = watson_nlp.load('syntax_izumo_en_stock')
# USE Embedding Model
use_model = watson_nlp.load('embedding_use_en_stock')
Classification blocks expect the training data in data streams. You can create data streams using several utility methods, as shown below.
training_data_file = train_file
# Create datastream from training data
data_stream_resolver = DataStreamResolver(target_stream_type=list, expected_keys={'text': str, 'labels': list})
training_data = data_stream_resolver.as_data_stream(training_data_file)
# Create Syntax stream
text_stream, labels_stream = training_data[0], training_data[1]
syntax_stream = syntax_model.stream(text_stream)
use_train_stream = use_model.stream(syntax_stream, doc_embed_style='raw_text')
use_svm_train_stream = watson_nlp.data_model.DataStream.zip(use_train_stream, labels_stream)
Train the classifier. Note: This cell will run for several minutes.
# Train the SVM model
svm_model = SVM.train(use_svm_train_stream)
Before you evaluate the model, you will train another one and look at how to store and reload models from the project.
The ensemble model can combine up to three classification models: CNN, SVM with TF-IDF features, and SVM with USE embeddings.
You will use SVM with TF-IDF and SVM with USE as algorithms for the Ensemble classifier. It computes the weighted mean of classification predictions using confidence scores. You will use the default weights which can be fine-tuned in subsequent steps.
The ensemble workflow is easy to use, and its performance is often better than that of the individual algorithms.
It depends on the syntax model and the USE embeddings. They are passed with the file containing the training data.
Train the ensemble classifier.
from watson_nlp.workflows.classification import GenericEnsemble
from watson_nlp.workflows.classification.base_classifier import TFidfSvm
from watson_nlp.workflows.classification.base_classifier import UseSvm
ensemble_model = GenericEnsemble.train(training_data, syntax_model,
base_classifiers_params=[
TFidfSvm.TrainParams(syntax_model=syntax_model),
UseSvm.TrainParams(syntax_model=syntax_model, use_embedding_model=use_model, doc_embed_style='raw_text')])
You can save a model as a project asset. model.as_bytes()
creates a ZIP archive, which is provided as a BytesIO object that is stored in the project.
Note: These steps are optional. You can skip them and continue at Classify test data and compare model quality.
Save both models in your project.
wslib.save_data('svm_model', data=svm_model.as_bytes(), overwrite=True)
{'name': 'svm_model', 'asset_type': 'data_asset', 'asset_id': '4325944d-9975-456c-a3dc-30b148688069', 'attachment_id': 'f3dc9856-bb58-469d-8c89-477d39f67f4f', 'filepath': 'svm_model.', 'data_size': 38778, 'mime': 'application/binary', 'summary': ['looked up asset', 'selected attachment', 'overwritten file', 'updated attachment'], 'access_count': 1}
wslib.save_data('ensemble_model', data=ensemble_model.as_bytes(), overwrite=True)
{'name': 'ensemble_model', 'asset_type': 'data_asset', 'asset_id': '43cb9ca3-a2fa-4aca-bdbf-92c753f895c8', 'attachment_id': '64fe3d95-1d6d-4a1c-9667-3034bc525b41', 'filepath': 'ensemble_model.', 'data_size': 962178135, 'mime': 'application/binary', 'summary': ['looked up asset', 'selected attachment', 'overwritten file', 'updated attachment'], 'access_count': 1}
The ZIP archive created by the save_data
function is compatible with the watson_nlp.load()
function, which is also used to load the predefined Watson NLP models.
svm_model = watson_nlp.load(wslib.load_data('svm_model'))
ensemble_model = watson_nlp.load(wslib.load_data('ensemble_model'))
Now you are able to run the trained models on new data. You will run the models on the test data so that the results can also be used for model evaluation. For illustration purposes, the test data is used in its original format, because new complaints that you receive will typically arrive in that format as well.
Notice that the SVM with USE embeddings model requires you to run the syntax model on the input texts first.
Create a helper method to run both models on a single complaint and return the predicted product groups of both models.
def predict_product(text):
# run syntax model first
syntax_result = syntax_model.run(text)
# run SVM model on top of syntax result
svm_preds = svm_model.run(use_model.run(syntax_result, doc_embed_style='raw_text'))
predicted_svm = svm_preds.to_dict()["classes"][0]["class_name"]
ensemble_preds = ensemble_model.run(text)
predicted_ensemble = ensemble_preds.to_dict()["classes"][0]["class_name"]
return (predicted_svm, predicted_ensemble)
Run the models on the complete test data.
predictions = test_orig_df[text_col].apply(lambda text: predict_product(text))
predictions_df = pd.DataFrame.from_records(predictions, columns=('Predicted SVM', 'Predicted Ensemble'))
result_df = test_orig_df[[text_col, "Product"]].merge(predictions_df, how='left', left_index=True, right_index=True)
result_df.head()
Consumer complaint narrative | Product | Predicted SVM | Predicted Ensemble | |
---|---|---|---|---|
0 | On XXXX, went to bank of America to close joint account, because of fraud, id theft. I asked to be added as power of attorney, to speak on my mother behalf, instead, the service rep removed me from the account which is mine, because, I opened it, and my mother placed me on the account, in 2014. Id theft was for {$680.00} at XXXX. This behavior is illegal, they can not remove me off the account... | Checking or savings account | Checking or savings account | Checking or savings account |
1 | The supervisor Mrs.Bille seen two transactions that were held a date late and didn't reconcile the issue with my deposit XX/XX/XXXX and Today the XXXX I asked reasonable questions and she hung up in my face as if I haven't been an trustworthy and honest customer for more than 6 years. I will contact the BBB Because u just treat people like that in such I disgrace manner ... now I'm homeless du... | Checking or savings account | Checking or savings account | Credit card or prepaid card |
2 | USAA Bank Fraud XXXX XXXX account Impersonating representatives accessing account stating account closed. \n\nCheck mailed for remainder of account. \n\nNo verification of account closed. Letter from USSA state changes not closure to account. \n\nAlso current screenshot current status of truck claim fraud. \n\nXXXX XXXX XXXX XXXX XXXX XXXX XXXX, NC XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX ... | Checking or savings account | Checking or savings account | Checking or savings account |
3 | On XX/XX/XXXX I was a victim of fraud when 3 transactions were made in my name without my authorization, they were for XXXX and XXXX dollars, I knew it because the bank sent me a voice message, when I called them they explained the situation to me, they blocked my debit card and they started the investigation.On XX/XX/XXXX they told me that everything indicated that I had made these transactio... | Checking or savings account | Money transfer, virtual currency, or money service | Checking or savings account |
4 | This is in reference to chase claim number XXXX Fraudulent account with joint tenancy was opened sometimes in XX/XX/2021. Unauthorized transfers from Chase checking account, ending in XXXX were made between XX/XX/2021 to XX/XX/2021, in the total amount of {$140000.00}, to an account opened by fraudsters. \n\nChase bank refused to assist in recover the stolen funds, after police report/identity... | Checking or savings account | Checking or savings account | Checking or savings account |
Watson NLP offers a method to calculate different quality metrics for a given model. Use the test data to evaluate the quality of your models.
# run the models on the test data - this time in batch mode
preprocess_func = lambda raw_doc: use_model.run_batch(syntax_model.run_batch(raw_doc))
svm_model.evaluate_quality(test_file, preprocess_func=preprocess_func)
WARNING: Only Micro_avg metrics could be calculated based on the information available for this block type.
{'per_class_confusion_matrix': {'Checking or savings account': {'true_positive': 27, 'false_positive': 127, 'false_negative': 33, 'precision': 0.17532467532467533, 'recall': 0.45, 'f1': 0.25233644859813087}, 'Debt collection': {'true_positive': 11, 'false_positive': 102, 'false_negative': 49, 'precision': 0.09734513274336283, 'recall': 0.18333333333333332, 'f1': 0.12716763005780346}, 'Credit reporting, credit repair services, or other personal consumer reports': {'true_positive': 6, 'false_positive': 36, 'false_negative': 54, 'precision': 0.14285714285714285, 'recall': 0.1, 'f1': 0.11764705882352941}, 'Credit card or prepaid card': {'true_positive': 9, 'false_positive': 45, 'false_negative': 51, 'precision': 0.16666666666666666, 'recall': 0.15, 'f1': 0.15789473684210525}, 'Money transfer, virtual currency, or money service': {'true_positive': 4, 'false_positive': 38, 'false_negative': 56, 'precision': 0.09523809523809523, 'recall': 0.06666666666666667, 'f1': 0.0784313725490196}, 'Vehicle loan or lease': {'true_positive': 3, 'false_positive': 9, 'false_negative': 57, 'precision': 0.25, 'recall': 0.05, 'f1': 0.08333333333333334}, 'Mortgage': {'true_positive': 0, 'false_positive': 3, 'false_negative': 60, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}}, 'macro_true_positive': None, 'macro_false_positive': None, 'macro_false_negative': None, 'macro_precision': 0.13249024468999185, 'macro_recall': 0.14285714285714285, 'macro_f1': 0.11668722574341742, 'micro_precision': 0.14285714285714285, 'micro_recall': 0.14285714285714285, 'micro_f1': 0.14285714285714285, 'overall_tp': 60, 'overall_fp': 360, 'overall_fn': 360, 'detailed_metrics': [], 'micro_precision_partial_match': 0.0, 'micro_recall_partial_match': 0.0, 'micro_f1_partial_match': 0.0}
You can see that the precision, recall, and f1-measure for some classes are much lower than for others. A likely reason is that some classes are difficult to tell apart.
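As a sanity check, the per-class numbers in the output above can be reproduced from the raw confusion-matrix counts. For the Checking or savings account class, the report lists TP=27, FP=127, FN=33:

```python
def precision_recall_f1(tp, fp, fn):
    """Standard precision/recall/F1 from confusion-matrix counts."""
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1

# counts reported for 'Checking or savings account' in the output above
p, r, f1 = precision_recall_f1(tp=27, fp=127, fn=33)
print(round(p, 4), round(r, 4), round(f1, 4))  # 0.1753 0.45 0.2523
```

The high false-positive count (127) explains the low precision for this class: the SVM model predicts it far more often than it actually occurs in the test data.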
To find out whether this is the case, create a custom confusion matrix to see if there are classes that are semantically close and therefore frequently confused with one another.
Use the pandas crosstab to create a confusion matrix for both the SVM and the ensemble model and plot them as Seaborn heatmaps.
SVM_confusion_df = pd.crosstab(result_df['Product'], result_df['Predicted SVM'], rownames=['Actual'], normalize='index')
ensemble_confusion_df = pd.crosstab(result_df['Product'], result_df['Predicted Ensemble'], rownames=['Actual'], normalize='index')
figure, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,7))
sn.heatmap(SVM_confusion_df, annot=True, cmap="YlGnBu", ax=ax1, cbar=False)
sn.heatmap(ensemble_confusion_df, annot=True, cmap="YlGnBu", ax=ax2, cbar=False)
ax1.title.set_text("SVM")
ax2.title.set_text("Ensemble")
ax2.set_yticklabels([])
plt.show()
In the confusion matrix for the SVM model, you can now see that complaints about Credit reporting, credit repair services, or other personal consumer reports
are often misclassified as Debt collection
, and vice versa. Other common misclassifications can be read from the matrices.
Overall, the ensemble model performs better than the SVM model.
In subsequent steps, consider increasing the size of the training data or adjusting the weights of the ensemble model to gain better results.
This notebook showed how quickly and easily you can train and run different text classifiers using the Watson NLP library.
Copyright © 2021 IBM. This notebook and its source code are released under the terms of the MIT License.