cyh
/
coursera_machine_learning_ex


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
							function stem = porterStemmer(inString)
% Applies the Porter Stemming algorithm as presented in the following
% paper:
% Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
%   no. 3, pp 130-137

% Original code modeled after the C version provided at:
% http://www.tartarus.org/~martin/PorterStemmer/c.txt

% The main part of the stemming algorithm starts here. b is an array of
% characters, holding the word to be stemmed. The letters are in b[k0],
% b[k0+1] ending at b[k]. In fact k0 = 1 in this demo program (since
% matlab begins indexing by 1 instead of 0). k is readjusted downwards as
% the stemming progresses. Zero termination is not in fact used in the
% algorithm.

% To call this function, use the string to be stemmed as the input
% argument.  This function returns the stemmed word as a string.

% Lower-case string
inString = lower(inString);

global j;
b = inString;
k = length(b);
k0 = 1;
j = k;


% With this if statement, strings of length 1 or 2 don't go through the
% stemming process. Remove this conditional to match the published
% algorithm.
stem = b;
if k > 2
    % Output displays per step are commented out.
    %disp(sprintf('Word to stem: %s', b));
    x = step1ab(b, k, k0);
    %disp(sprintf('Steps 1A and B yield: %s', x{1}));
    x = step1c(x{1}, x{2}, k0);
    %disp(sprintf('Step 1C yields: %s', x{1}));
    x = step2(x{1}, x{2}, k0);
    %disp(sprintf('Step 2 yields: %s', x{1}));
    x = step3(x{1}, x{2}, k0);
    %disp(sprintf('Step 3 yields: %s', x{1}));
    x = step4(x{1}, x{2}, k0);
    %disp(sprintf('Step 4 yields: %s', x{1}));
    x = step5(x{1}, x{2}, k0);
    %disp(sprintf('Step 5 yields: %s', x{1}));
    stem = x{1};
end

% cons(j) is TRUE <=> b[j] is a consonant.
function c = cons(i, b, k0)
c = true;
switch(b(i))
    case {'a', 'e', 'i', 'o', 'u'}
        c = false;
    case 'y'
        if i == k0
            c = true;
        else
            c = ~cons(i - 1, b, k0);
        end
end

% mseq() measures the number of consonant sequences between k0 and j.  If
% c is a consonant sequence and v a vowel sequence, and <..> indicates
% arbitrary presence,

%      <c><v>       gives 0
%      <c>vc<v>     gives 1
%      <c>vcvc<v>   gives 2
%      <c>vcvcvc<v> gives 3
%      ....
function n = measure(b, k0)
global j;
n = 0;
i = k0;
while true
    if i > j
        return
    end
    if ~cons(i, b, k0)
        break;
    end
    i = i + 1;
end
i = i + 1;
while true
    while true
        if i > j
            return
        end
        if cons(i, b, k0)
            break;
        end
        i = i + 1;
    end
    i = i + 1;
    n = n + 1;
    while true
        if i > j
            return
        end
        if ~cons(i, b, k0)
            break;
        end
        i = i + 1;
    end
    i = i + 1;
end


% vowelinstem() is TRUE <=> k0,...j contains a vowel
function vis = vowelinstem(b, k0)
global j;
for i = k0:j,
    if ~cons(i, b, k0)
        vis = true;
        return
    end
end
vis = false;

%doublec(i) is TRUE <=> i,(i-1) contain a double consonant.
function dc = doublec(i, b, k0)
if i < k0+1
    dc = false;
    return
end
if b(i) ~= b(i-1)
    dc = false;
    return
end
dc = cons(i, b, k0);


% cvc(j) is TRUE <=> j-2,j-1,j has the form consonant - vowel - consonant
% and also if the second c is not w,x or y. this is used when trying to
% restore an e at the end of a short word. e.g.
%
%      cav(e), lov(e), hop(e), crim(e), but
%      snow, box, tray.

function c1 = cvc(i, b, k0)
if ((i < (k0+2)) || ~cons(i, b, k0) || cons(i-1, b, k0) || ~cons(i-2, b, k0))
    c1 = false;
else
    if (b(i) == 'w' || b(i) == 'x' || b(i) == 'y')
        c1 = false;
        return
    end
    c1 = true;
end

% ends(s) is TRUE <=> k0,...k ends with the string s.
function s = ends(str, b, k)
global j;
if (str(length(str)) ~= b(k))
    s = false;
    return
end % tiny speed-up
if (length(str) > k)
    s = false;
    return
end
if strcmp(b(k-length(str)+1:k), str)
    s = true;
    j = k - length(str);
    return
else
    s = false;
end

% setto(s) sets (j+1),...k to the characters in the string s, readjusting
% k accordingly.

function so = setto(s, b, k)
global j;
for i = j+1:(j+length(s))
    b(i) = s(i-j);
end
if k > j+length(s)
    b((j+length(s)+1):k) = '';
end
k = length(b);
so = {b, k};

% rs(s) is used further down.
% [Note: possible null/value for r if rs is called]
function r = rs(str, b, k, k0)
r = {b, k};
if measure(b, k0) > 0
    r = setto(str, b, k);
end

% step1ab() gets rid of plurals and -ed or -ing. e.g.

%       caresses  ->  caress
%       ponies    ->  poni
%       ties      ->  ti
%       caress    ->  caress
%       cats      ->  cat

%       feed      ->  feed
%       agreed    ->  agree
%       disabled  ->  disable

%       matting   ->  mat
%       mating    ->  mate
%       meeting   ->  meet
%       milling   ->  mill
%       messing   ->  mess

%       meetings  ->  meet

function s1ab = step1ab(b, k, k0)
global j;
if b(k) == 's'
    if ends('sses', b, k)
        k = k-2;
    elseif ends('ies', b, k)
        retVal = setto('i', b, k);
        b = retVal{1};
        k = retVal{2};
    elseif (b(k-1) ~= 's')
        k = k-1;
    end
end
if ends('eed', b, k)
    if measure(b, k0) > 0;
        k = k-1;
    end
elseif (ends('ed', b, k) || ends('ing', b, k)) && vowelinstem(b, k0)
    k = j;
    retVal = {b, k};
    if ends('at', b, k)
        retVal = setto('ate', b(k0:k), k);
    elseif ends('bl', b, k)
        retVal = setto('ble', b(k0:k), k);
    elseif ends('iz', b, k)
        retVal = setto('ize', b(k0:k), k);
    elseif doublec(k, b, k0)
        retVal = {b, k-1};
        if b(retVal{2}) == 'l' || b(retVal{2}) == 's' || ...
                b(retVal{2}) == 'z'
            retVal = {retVal{1}, retVal{2}+1};
        end
    elseif measure(b, k0) == 1 && cvc(k, b, k0)
        retVal = setto('e', b(k0:k), k);
    end
    k = retVal{2};
    b = retVal{1}(k0:k);
end
j = k;
s1ab = {b(k0:k), k};

%  step1c() turns terminal y to i when there is another vowel in the stem.
function s1c = step1c(b, k, k0)
global j;
if ends('y', b, k) && vowelinstem(b, k0)
    b(k) = 'i';
end
j = k;
s1c = {b, k};

% step2() maps double suffices to single ones. so -ization ( = -ize plus
% -ation) maps to -ize etc. note that the string before the suffix must give
% m() > 0.
function s2 = step2(b, k, k0)
global j;
s2 = {b, k};
switch b(k-1)
    case {'a'}
        if ends('ational', b, k) s2 = rs('ate', b, k, k0);
        elseif ends('tional', b, k) s2 = rs('tion', b, k, k0); end;
    case {'c'}
        if ends('enci', b, k) s2 = rs('ence', b, k, k0);
        elseif ends('anci', b, k) s2 = rs('ance', b, k, k0); end;
    case {'e'}
        if ends('izer', b, k) s2 = rs('ize', b, k, k0); end;
    case {'l'}
        if ends('bli', b, k) s2 = rs('ble', b, k, k0);
        elseif ends('alli', b, k) s2 = rs('al', b, k, k0);
        elseif ends('entli', b, k) s2 = rs('ent', b, k, k0);
        elseif ends('eli', b, k) s2 = rs('e', b, k, k0);
        elseif ends('ousli', b, k) s2 = rs('ous', b, k, k0); end;
    case {'o'}
        if ends('ization', b, k) s2 = rs('ize', b, k, k0);
        elseif ends('ation', b, k) s2 = rs('ate', b, k, k0);
        elseif ends('ator', b, k) s2 = rs('ate', b, k, k0); end;
    case {'s'}
        if ends('alism', b, k) s2 = rs('al', b, k, k0);
        elseif ends('iveness', b, k) s2 = rs('ive', b, k, k0);
        elseif ends('fulness', b, k) s2 = rs('ful', b, k, k0);
        elseif ends('ousness', b, k) s2 = rs('ous', b, k, k0); end;
    case {'t'}
        if ends('aliti', b, k) s2 = rs('al', b, k, k0);
        elseif ends('iviti', b, k) s2 = rs('ive', b, k, k0);
        elseif ends('biliti', b, k) s2 = rs('ble', b, k, k0); end;
    case {'g'}
        if ends('logi', b, k) s2 = rs('log', b, k, k0); end;
end
j = s2{2};

% step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
function s3 = step3(b, k, k0)
global j;
s3 = {b, k};
switch b(k)
    case {'e'}
        if ends('icate', b, k) s3 = rs('ic', b, k, k0);
        elseif ends('ative', b, k) s3 = rs('', b, k, k0);
        elseif ends('alize', b, k) s3 = rs('al', b, k, k0); end;
    case {'i'}
        if ends('iciti', b, k) s3 = rs('ic', b, k, k0); end;
    case {'l'}
        if ends('ical', b, k) s3 = rs('ic', b, k, k0);
        elseif ends('ful', b, k) s3 = rs('', b, k, k0); end;
    case {'s'}
        if ends('ness', b, k) s3 = rs('', b, k, k0); end;
end
j = s3{2};

% step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
function s4 = step4(b, k, k0)
global j;
switch b(k-1)
    case {'a'}
        if ends('al', b, k) end;
    case {'c'}
        if ends('ance', b, k)
        elseif ends('ence', b, k) end;
    case {'e'}
        if ends('er', b, k) end;
    case {'i'}
        if ends('ic', b, k) end;
    case {'l'}
        if ends('able', b, k)
        elseif ends('ible', b, k) end;
    case {'n'}
        if ends('ant', b, k)
        elseif ends('ement', b, k)
        elseif ends('ment', b, k)
        elseif ends('ent', b, k) end;
    case {'o'}
        if ends('ion', b, k)
            if j == 0
            elseif ~(strcmp(b(j),'s') || strcmp(b(j),'t'))
                j = k;
            end
        elseif ends('ou', b, k) end;
    case {'s'}
        if ends('ism', b, k) end;
    case {'t'}
        if ends('ate', b, k)
        elseif ends('iti', b, k) end;
    case {'u'}
        if ends('ous', b, k) end;
    case {'v'}
        if ends('ive', b, k) end;
    case {'z'}
        if ends('ize', b, k) end;
end
if measure(b, k0) > 1
    s4 = {b(k0:j), j};
else
    s4 = {b(k0:k), k};
end

% step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
function s5 = step5(b, k, k0)
global j;
j = k;
if b(k) == 'e'
    a = measure(b, k0);
    if (a > 1) || ((a == 1) && ~cvc(k-1, b, k0))
        k = k-1;
    end
end
if (b(k) == 'l') && doublec(k, b, k0) && (measure(b, k0) > 1)
    k = k-1;
end
s5 = {b(k0:k), k};