123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- function word_indices = processEmail(email_contents)
- %PROCESSEMAIL preprocesses a the body of an email and
- %returns a list of word_indices
- % word_indices = PROCESSEMAIL(email_contents) preprocesses
- % the body of an email and returns a list of indices of the
- % words contained in the email.
- %
- % Load Vocabulary
- vocabList = getVocabList();
- % Init return value
- word_indices = [];
- % ========================== Preprocess Email ===========================
- % Find the Headers ( \n\n and remove )
- % Uncomment the following lines if you are working with raw emails with the
- % full headers
- % hdrstart = strfind(email_contents, ([char(10) char(10)]));
- % email_contents = email_contents(hdrstart(1):end);
- % Lower case
- email_contents = lower(email_contents);
- % Strip all HTML
- % Looks for any expression that starts with < and ends with > and replace
- % and does not have any < or > in the tag it with a space
- email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
- % Handle Numbers
- % Look for one or more characters between 0-9
- email_contents = regexprep(email_contents, '[0-9]+', 'number');
- % Handle URLS
- % Look for strings starting with http:// or https://
- email_contents = regexprep(email_contents, ...
- '(http|https)://[^\s]*', 'httpaddr');
- % Handle Email Addresses
- % Look for strings with @ in the middle
- email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
- % Handle $ sign
- email_contents = regexprep(email_contents, '[$]+', 'dollar');
- % ========================== Tokenize Email ===========================
- % Output the email to screen as well
- fprintf('\n==== Processed Email ====\n\n');
- % Process file
- l = 0;
- while ~isempty(email_contents)
- % Tokenize and also get rid of any punctuation
- [str, email_contents] = ...
- strtok(email_contents, ...
- [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
-
- % Remove any non alphanumeric characters
- str = regexprep(str, '[^a-zA-Z0-9]', '');
- % Stem the word
- % (the porterStemmer sometimes has issues, so we use a try catch block)
- try str = porterStemmer(strtrim(str));
- catch str = ''; continue;
- end;
- % Skip the word if it is too short
- if length(str) < 1
- continue;
- end
- % Look up the word in the dictionary and add to word_indices if
- % found
- % ====================== YOUR CODE HERE ======================
- % Instructions: Fill in this function to add the index of str to
- % word_indices if it is in the vocabulary. At this point
- % of the code, you have a stemmed word from the email in
- % the variable str. You should look up str in the
- % vocabulary list (vocabList). If a match exists, you
- % should add the index of the word to the word_indices
- % vector. Concretely, if str = 'action', then you should
- % look up the vocabulary list to find where in vocabList
- % 'action' appears. For example, if vocabList{18} =
- % 'action', then, you should add 18 to the word_indices
- % vector (e.g., word_indices = [word_indices ; 18]; ).
- %
- % Note: vocabList{idx} returns a the word with index idx in the
- % vocabulary list.
- %
- % Note: You can use strcmp(str1, str2) to compare two strings (str1 and
- % str2). It will return 1 only if the two strings are equivalent.
- %
- for i = 1:length(vocabList)
- if strcmp(str, vocabList(i))
- word_indices = [word_indices; i];
- break;
- end
- end
- % =============================================================
- % Print to screen, ensuring that the output lines are not too long
- if (l + length(str) + 1) > 78
- fprintf('\n');
- l = 0;
- end
- fprintf('%s ', str);
- l = l + length(str) + 1;
- end
- % Print footer
- fprintf('\n\n=========================\n');
- end
|