% ex6_spam.m (4.5 KB)

  %% Machine Learning Online Class
  %  Exercise 6 | Spam Classification with SVMs
  %
  %  Instructions
  %  ------------
  %
  %  This script is the driver for the exercise. The functions you are
  %  expected to complete are:
  %
  %     gaussianKernel.m
  %     dataset3Params.m
  %     processEmail.m
  %     emailFeatures.m
  %
  %  Nothing in this file, nor in any file other than the four listed
  %  above, needs to be modified for this exercise.
  %

  %% Initialization
  % Start from a clean session: drop workspace variables, close any open
  % figures, and clear the command window.
  clear;
  close all;
  clc;
  %% ==================== Part 1: Email Preprocessing ====================
  %  An SVM cannot consume raw text, so every email must first be turned
  %  into a vector of features. This part exercises the preprocessing
  %  stage: processEmail.m (which you complete) produces a word indices
  %  vector for a given email.

  fprintf('\nPreprocessing sample email (emailSample1.txt)\n');

  % Load the sample email and run it through the preprocessing step
  file_contents = readFile('emailSample1.txt');
  word_indices  = processEmail(file_contents);

  % Print every word index, each preceded by a single space
  fprintf('Word Indices: \n');
  fprintf(' %d', word_indices);
  fprintf('\n\n');

  % Hold here until the user presses a key
  fprintf('Program paused. Press enter to continue.\n');
  pause;
  %% ==================== Part 2: Feature Extraction ====================
  %  Next, each email is converted into a vector of features in R^n.
  %  emailFeatures.m (which you complete) produces that feature vector
  %  for a given email.

  fprintf('\nExtracting features from sample email (emailSample1.txt)\n');

  % Preprocess the sample email again, then extract its features
  file_contents = readFile('emailSample1.txt');
  word_indices  = processEmail(file_contents);
  features      = emailFeatures(word_indices);

  % Report the vector length and how many entries are strictly positive
  fprintf('Length of feature vector: %d\n', length(features));
  fprintf('Number of non-zero entries: %d\n', sum(features > 0));

  % Hold here until the user presses a key
  fprintf('Program paused. Press enter to continue.\n');
  pause;
  %% =========== Part 3: Train Linear SVM for Spam Classification ========
  %  Train a linear classifier that decides whether an email is Spam or
  %  Not-Spam.

  % Loading the Spam Email dataset places X and y in the workspace
  load('spamTrain.mat');

  fprintf('\nTraining Linear SVM (Spam Classification)\n')
  fprintf('(this may take 1 to 2 minutes) ...\n')

  % Train with C = 0.1 and the linear kernel
  C = 0.1;
  model = svmTrain(X, y, C, @linearKernel);

  % Predict on the training set itself and report the fraction of
  % matching labels as a percentage
  p = svmPredict(model, X);
  fprintf('Training Accuracy: %f\n', 100 * mean(double(p == y)));
  %% =================== Part 4: Test Spam Classification ================
  %  After training the classifier, evaluate it on a held-out test set,
  %  which is provided in spamTest.mat.

  % Loading the test dataset places Xtest and ytest in the workspace
  load('spamTest.mat');

  fprintf('\nEvaluating the trained Linear SVM on a test set ...\n')

  % Predict with the model trained in Part 3 and report the fraction of
  % matching labels as a percentage
  p = svmPredict(model, Xtest);
  fprintf('Test Accuracy: %f\n', mean(double(p == ytest)) * 100);

  % Tell the user why execution has stopped. Every other pause in this
  % script prints this prompt; without it the script appears to hang.
  fprintf('\nProgram paused. Press enter to continue.\n');
  pause;
  %% ================= Part 5: Top Predictors of Spam ====================
  %  Since the model we are training is a linear SVM, we can inspect the
  %  weights learned by the model to understand better how it is
  %  determining whether an email is spam or not. The following code finds
  %  the words with the highest weights in the classifier. Informally, the
  %  classifier 'thinks' that these words are the most likely indicators
  %  of spam.
  %

  % Sort the weights (largest first) and obtain the vocabulary list
  [weight, idx] = sort(model.w, 'descend');
  vocabList = getVocabList();

  % Show at most 15 entries; clamp to the number of weights so a model
  % with fewer than 15 weights does not index out of bounds.
  numTop = min(15, numel(idx));

  fprintf('\nTop predictors of spam: \n');
  % Loop variable is k rather than i so the imaginary unit is not shadowed
  % in the workspace after this section runs.
  for k = 1:numTop
      % idx(k) is the position of the k-th largest weight; it is used to
      % look up the corresponding word in the vocabulary list.
      fprintf(' %-15s (%f) \n', vocabList{idx(k)}, weight(k));
  end

  fprintf('\n\n');
  fprintf('\nProgram paused. Press enter to continue.\n');
  pause;
  %% =================== Part 6: Try Your Own Emails =====================
  %  With the spam classifier trained, you can now run it on emails of
  %  your own. The starter code ships with spamSample1.txt,
  %  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
  %  The code below reads one of these emails and uses the learned SVM
  %  classifier to decide whether it is Spam or Not Spam.

  % File to classify. Change this to spamSample2.txt, emailSample1.txt or
  % emailSample2.txt to see predictions on different kinds of email, or
  % point it at an email of your own.
  filename = 'spamSample1.txt';

  % Read the email, preprocess it, and extract its features
  file_contents = readFile(filename);
  word_indices  = processEmail(file_contents);
  x             = emailFeatures(word_indices);

  % Classify with the model trained earlier in this script
  p = svmPredict(model, x);

  % Report the prediction along with the meaning of the label values
  fprintf('\nProcessed %s\n\nSpam Classification: %d\n', filename, p);
  fprintf('(1 indicates spam, 0 indicates not spam)\n\n');