processEmail.m 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  function word_indices = processEmail(email_contents)
  %PROCESSEMAIL Preprocess the body of an email and return a list of
  %vocabulary indices.
  %   word_indices = PROCESSEMAIL(email_contents) normalizes the text in
  %   email_contents, splits it into stemmed tokens, and returns a column
  %   vector holding, for every token found in the vocabulary, the index of
  %   that token in the list returned by getVocabList(). Tokens that are not
  %   in the vocabulary contribute nothing to the result. Indices appear in
  %   the order the tokens occur in the email, so repeated words produce
  %   repeated indices.
  %
  %   The processed tokens are also echoed to the screen, wrapped so that no
  %   output line grows past 78 characters.
  %
  %   NOTE(review): vocabList is assumed to be a cell array of character
  %   vectors (it is indexed as vocabList{idx} in the exercise notes) --
  %   confirm against getVocabList.

      % Load Vocabulary
      vocabList = getVocabList();

      % Init return value
      word_indices = [];

      % ========================== Preprocess Email ===========================

      % Find the Headers ( \n\n and remove )
      % Uncomment the following lines if you are working with raw emails with
      % the full headers

      % hdrstart = strfind(email_contents, ([char(10) char(10)]));
      % email_contents = email_contents(hdrstart(1):end);

      % Lower case
      email_contents = lower(email_contents);

      % Strip all HTML
      % Looks for any expression that starts with <, ends with > and does not
      % have any < or > inside the tag, and replaces it with a space
      email_contents = regexprep(email_contents, '<[^<>]+>', ' ');

      % Handle Numbers
      % Look for one or more characters between 0-9
      email_contents = regexprep(email_contents, '[0-9]+', 'number');

      % Handle URLS
      % Look for strings starting with http:// or https://
      email_contents = regexprep(email_contents, ...
                                 '(http|https)://[^\s]*', 'httpaddr');

      % Handle Email Addresses
      % Look for strings with @ in the middle
      email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');

      % Handle $ sign
      email_contents = regexprep(email_contents, '[$]+', 'dollar');

      % ========================== Tokenize Email ===========================

      % Output the email to screen as well
      fprintf('\n==== Processed Email ====\n\n');

      % Token delimiters: punctuation, space, line feed and carriage return.
      % The set never changes, so it is built once outside the loop.
      delimiters = [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)];

      % Number of characters already printed on the current output line
      l = 0;

      while ~isempty(email_contents)

          % Tokenize and also get rid of any punctuation
          [str, email_contents] = strtok(email_contents, delimiters);

          % Remove any non alphanumeric characters
          str = regexprep(str, '[^a-zA-Z0-9]', '');

          % Stem the word
          % (the porterStemmer sometimes has issues, so we use a try catch
          % block). The catch keyword is kept on its own line so the handler
          % body cannot be mistaken for an exception identifier.
          try
              str = porterStemmer(strtrim(str));
          catch
              str = '';
              continue;
          end

          % Skip the word if it is too short
          if length(str) < 1
              continue;
          end

          % Look up the word in the dictionary and add to word_indices if
          % found. strcmp compares str against every vocabulary entry at
          % once; find(..., 1) keeps only the first match, which is what the
          % previous element-by-element loop with an early break produced.
          match = find(strcmp(str, vocabList), 1);
          if ~isempty(match)
              word_indices = [word_indices; match];
          end

          % Print to screen, ensuring that the output lines are not too long
          if (l + length(str) + 1) > 78
              fprintf('\n');
              l = 0;
          end
          fprintf('%s ', str);
          l = l + length(str) + 1;

      end

      % Print footer
      fprintf('\n\n=========================\n');

  end