This is my small contribution to this forum :rasta:
Text::Trim and List::Util you can download from cpan.org
Usage:
Text::Trim and List::Util you can download from cpan.org
Code:
package SG::Markov;
use Data::Dumper;
#use Lingua::EN::Sentence qw( get_sentences );
use Encode qw(encode_utf8);
use Text::Trim;
use List::Util qw(first max maxstr min minstr reduce shuffle sum);
use strict;
sub new {
my $pkg = shift;
my $gran = shift;
my $numwords = shift;
my $this = {};
bless $this,$pkg;
srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip`);
$this->{G} = $gran;
$this->{O} = $numwords;
$this->{min_paragraphLength} = 4;
$this->{max_paragraphLength} = 5;
return $this;
}
sub inputString {
my $this = shift;
my $string = shift;
# removes multiple spaces and other unncessary formating from text
$string =~ s/^\s+//;
$string =~ s/\s+/ /g;
$string =~ s/ +/ /g;
# save / add to originalText class var
$this->{originalText} = encode_utf8 trim($this->{originalText} . ' ' . $string);
# Remove all grabage
$this->{originalText} =~ s/[^\w\d\s\x21-\x7E]+/ /g;
}
#
# turns text into an array and returns the result
#
# @param string $text - used to create word array - if false, var originalText is used
# @return array word array
#
sub createWordArray {
my $this = shift;
my $text = shift;
if (!$text) {
if (!$this->{originalText}) {
print "Error: No input text!\n";
return undef;
}
$text = $this->{originalText};
}
return split(/[\s]+/, $text);
}
#
# creates an array (frequency table) that is used to determine
# what words should be used in relation to each other.
#
# @return bool - success or failure
#
sub frequencyTable {
my $this = shift;
if (!$this->{textwords}) {
return undef if (!(@{$this->{textwords}} = $this->createWordArray())); # no article text has been set!!
}
# set maximum number of entries for frequency table
my $loopmax = $#{$this->{textwords}}+1 - $this->{G} + 1;
my $frequency_table;
# fill up the frequency table
for (my $j = 0; $j < $loopmax; $j ++) {
my $key_string;
my $end = $j + $this->{G};
for (my $k = $j; $k < $end; $k ++) {
#build the key string (G - 1) words
$key_string .= $this->{textwords}->[$k].' ';
}
if (!defined($frequency_table->{'_'.$key_string})) { $frequency_table->{'_'.$key_string}=""; }
if (defined($this->{textwords}->[$j + $this->{G}])) { $frequency_table->{'_'.$key_string} .= $this->{textwords}->[$j + $this->{G}]." "; }
}
$this->{frequency_table} = $frequency_table;
return 1;
}
#
# generates a new document using the text provided. the frequency
# table, in combination with random numbers, to figure out what words
# should come before and after each other.
#
#
#
sub generate {
my $this = shift;
my $buffer;
my $output;
my @lastwords;
my $frequency_table = $this->{frequency_table};
my $textwords = $this->{textwords};
# start off the output by adding the first G words
# where G is the granularity integer
for (my $i = 0; $i < $this->{G}; $i ++) {
push @lastwords, $textwords->[$i]; # record last used words
$output .= " ".$textwords->[$i];
}
# only output O words or less (O is a class var)
for (my $i = 0; $i < $this->{O}; $i ++) {
# create key_string that is used to determine words will come next
my $key_string = "";
for (my $j = 0; $j < $this->{G}; $j ++) {
$key_string .= $lastwords[$j]." ";
}
# only if the above key_string has a corresponding record in the freq. table
# find all the words in that record and create an array. using that array
# find 1 word (at random)
if ($frequency_table->{'_'.$key_string}) {
my @possible = split(/ /, trim($frequency_table->{'_'.$key_string}));
my $c = $#possible + 1;
my $r = int(rand($c));
my $nextword = $possible[$r];
$output .= " $nextword";
# remove the first of the lastwords and add-on the most recent word
for (my $l = 0; $l < $this->{G} -1; $l ++) {
$lastwords[$l] = $lastwords[$l +1];
}
$lastwords[$this->{G} -1] = $nextword;
# if no key_string record was found, add all the last words onto the
# output again. (isn't this a bad thing to do?)
} else {
#
# The following was removed because it sometimes would output
# the same series of words over and over again
#$lastwords = array_splice($lastwords, 0, count($lastwords));
#for ($l = 0; $l < $this->G; $l ++) {
#$lastwords[] = $textwords[$l];
#$output .= ' '.$textwords[$l];
#}
#
}
} # end $i loop
return $this->format($output);
}
#
# format the text into paragraphs
#
#
sub format {
my $this = shift;
my $output = shift;
# split output into sentence array
#my $sentences=get_sentences($output);
#my @sentences = shuffle @$sentences;
$output =~ s/(\.|\?|!) /$1 <br>/g;
my @sentences = split(/ <br>/, $output);
# create paragraphs
$output = "";
while($#sentences+1 > 0) {
# determine sentence count for this paragraph
my $numberOfSentences = $this->{min_paragraphLength} + int(rand($this->{max_paragraphLength} - $this->{min_paragraphLength} + 1));
$output.="\n\n";
# now add all the sentences (up to limit)
for(my $i = 0; $i < $numberOfSentences; $i++) {
$output .= " ". pop(@sentences);
last if ($#sentences+1 == 0);
}
#$output .="\n\n";
}
return $output;
}
1;
Usage:
Code:
my $markov = SG::Markov->new(2, $word_cnt);
$markov->inputString($content);
$content = $markov->generate() if $markov->frequencyTable();