Author Topic: I don't think I quite understand Markov chains, this doesn't seem correct  (Read 597 times)

Code: [Select]
// Transition table: the row with id N holds the TAB-separated
// "follower count" pairs recorded for word N.
if(!isObject(MarkovList))
    new GuiTextListCtrl(MarkovList);

// Vocabulary: row N holds the text of word N (searched by getMarkovWordIndex).
if(!isObject(MarkovWordIndexes))
    new GuiTextListCtrl(MarkovWordIndexes);

// Looks %word up in MarkovWordIndexes with a linear scan.
// Returns the row number of the first row whose text equals %word
// (compared with the $= string operator), or -1 when no row matches.
function getMarkovWordIndex(%word) {
    %rows = MarkovWordIndexes.rowCount();
    %row = 0;

    while(%row < %rows) {
        if(MarkovWordIndexes.getRowText(%row) $= %word)
            return %row;

        %row++;
    }

    return -1;
}


// Builds the Markov transition table from every file matching
// Add-Ons/Server_Markov/texts/*.txt.
//
// For each word on a line, the word that directly follows it on the same line
// is counted. Data is stored in two lists that share the same word index:
//   MarkovWordIndexes  row N    -> text of the Nth distinct word
//   MarkovList         row id N -> TAB-separated "follower count" pairs
//
// Progress is echoed to the console; a summary (words, files, elapsed time)
// is announced with talk() when finished. Takes no arguments, returns nothing.
function determineMarkovCandidates() {
    %pattern = "Add-Ons/Server_Markov/texts/*.txt";
    %filename = findFirstFile(%pattern);

    %file = new FileObject();

    %files = 0;
    %words = 0;

    %start = getRealTime();

    while(isFile(%filename)) {
        // Skip files that cannot be opened instead of silently reading nothing.
        if(!%file.openForRead(%filename)) {
            echo("ERROR: could not open" SPC %filename SPC "for reading, skipping.");
            %filename = findNextFile(%pattern);
            continue;
        }

        %files++;

        echo("Reading" SPC %filename);
        talk("Reading" SPC %filename);

        while(!%file.isEOF()) {
            %line = %file.readLine();
            echo("Line :: " SPC %line);

            // The line does not change inside the loop, so count words once.
            %word_count = getWordCount(%line);

            for(%i = 0; %i < %word_count; %i++) {
                %current_word = getWord(%line, %i);
                if(%current_word $= "" || %current_word $= " ") {
                    continue;
                }

                %word_index = getMarkovWordIndex(%current_word);

                if(%word_index == -1) {
                    // New word: it is appended, so its row number (and id)
                    // is the row count before the insert.
                    %word_index = MarkovWordIndexes.rowCount();
                    MarkovWordIndexes.addRow(%word_index, %current_word, %word_index);
                }

                echo(%current_word SPC MarkovList.rowCount() SPC %word_index SPC getFieldCount(MarkovList.getRowText(%word_index)));

                %words++;

                // Only the last word on a line has no follower.
                %next_word = "";
                if(%i + 1 < %word_count) {
                    %next_word = getWord(%line, %i + 1);
                }

                %row = MarkovList.getRowTextByID(%word_index);

                if(%row $= "") {
                    // No followers recorded yet: create/overwrite the row.
                    if(%next_word $= "") {
                        %text = "";
                    } else {
                        %text = %next_word SPC 1;
                    }

                    MarkovList.setRowByID(%word_index, %text);
                    echo("NEW WORD:" SPC %current_word);
                    continue;
                }

                if(%next_word $= "") {
                    continue;
                }

                // Look for an existing "follower count" pair for %next_word.
                %found_piece = -1;
                %field_count = getFieldCount(%row);
                for(%j = 0; %j < %field_count; %j++) {
                    if(getWord(getField(%row, %j), 0) $= %next_word) {
                        %found_piece = %j;
                        break;
                    }
                }

                if(%found_piece != -1) {
                    // Known follower: bump its count in place.
                    %piece = getField(%row, %found_piece);
                    %new = setField(%row, %found_piece, getWord(%piece, 0) SPC (getWord(%piece, 1) + 1));
                } else {
                    // First time this follower is seen after this word.
                    %new = %row TAB %next_word SPC 1;
                }

                MarkovList.setRowByID(%word_index, %new);
            }
        }

        %file.close();

        %filename = findNextFile(%pattern);
    }

    // The FileObject is only needed while scanning; free it so repeated
    // rebuilds do not leak one object per call.
    %file.delete();

    talk("Chained\c3" SPC %words SPC "words \c6from\c3" SPC %files SPC "files \c6in\c2" SPC getTimeString((getRealTime() - %start)/1000) @ ".");
}

// Generates one phrase from the Markov table and sends it to echo() and talk().
//
// Starts from a uniformly random known word, then repeatedly picks a follower
// of the current word with probability proportional to its recorded count.
// Generation stops when the phrase ends in ".", after 30 steps, or when the
// current word has no recorded followers. Takes no arguments, returns nothing.
function doMarkov() {
    %word_total = MarkovWordIndexes.rowCount();
    if(%word_total < 1) {
        echo("ERROR: no Markov data loaded; run reloadMarkov() first.");
        return;
    }

    // getRandom(min, max) includes both bounds, so the last valid row is count - 1.
    %word_index = getRandom(0, %word_total - 1);
    %phrase = MarkovWordIndexes.getRowText(%word_index);
    %last_char = getSubStr(%phrase, strLen(%phrase)-1, 1);

    %iterations = 0;
    while(%last_char !$= ".") {
        %iterations++;
        if(%iterations > 30) {
            break;
        }

        // Rows in MarkovList are written by id (the word index), so read by id too.
        %candidates = MarkovList.getRowTextByID(%word_index);
        %candidate_count = getFieldCount(%candidates);

        // Dead end: this word was only ever seen at the end of a line.
        if(%candidate_count == 0) {
            break;
        }

        // Sum of all follower counts for this word.
        %total = 0;
        for(%i = 0; %i < %candidate_count; %i++) {
            %total += getWord(getField(%candidates, %i), 1);
        }

        if(%total < 1) {
            break;
        }

        // Weighted pick: roll in [0, total - 1] and walk the cumulative counts;
        // the first follower whose cumulative count exceeds the roll wins.
        %chosen = getRandom(0, %total - 1);
        %next_word = "";
        %running = 0;
        for(%i = 0; %i < %candidate_count; %i++) {
            %row = getField(%candidates, %i);
            %running += getWord(%row, 1);

            if(%chosen < %running) {
                %next_word = getWord(%row, 0);
                break;
            }
        }

        echo("Next word:" SPC %next_word);

        %word_index = getMarkovWordIndex(%next_word);

        if(%word_index == -1) {
            echo("ERROR: %word_index returned -1 in generation:" SPC %next_word);
            break;
        }

        %phrase = trim(%phrase SPC %next_word);
        %last_char = getSubStr(%phrase, strLen(%phrase)-1, 1);
    }

    echo(%phrase);
    talk(%phrase);
}

// Discards any existing Markov data and schedules a full rebuild.
//
// Both lists are always recreated, even if one of them did not exist, so that
// determineMarkovCandidates never runs against a missing object.
// The rebuild is scheduled 200 ms later so the announcement is sent first.
function reloadMarkov() {
    if(isObject(MarkovList)) {
        MarkovList.delete();
    }
    new GuiTextListCtrl(MarkovList);

    if(isObject(MarkovWordIndexes)) {
        MarkovWordIndexes.delete();
    }
    new GuiTextListCtrl(MarkovWordIndexes);

    talk("GENERATING MARKOV CANDIDATES.");
    schedule(200,0,determineMarkovCandidates);
}

I assumed a Markov chain just picks what happens next based on what has previously been observed to follow the current state, e.g.:

"this is a test and this test is a great test"
"this" can branch out to either "is" or "test"
"is" can only branch out to "a"
"test" can branch out to "and" or "is"

and so on

Am I correct? I'm getting off-the-wall output like this, even though I'm feeding almost 30,000 words into it.

looks about right

The output makes a bit more sense when each state is the previous two or more words instead of just one

i.e.
this is -> a
is a -> test, great