Markov perl module

Status
Not open for further replies.

kekc

New member
Nov 16, 2006
143
7
0
ɐpɐuɐɔ
This is my small contribution to this forum :rasta:

Text::Trim and List::Util you can download from cpan.org

Code:
package SG::Markov;

use Data::Dumper;
#use Lingua::EN::Sentence qw( get_sentences );
use Encode qw(encode_utf8);
use Text::Trim;
use List::Util qw(first max maxstr min minstr reduce shuffle sum);
use strict;

sub new {
	my $pkg = shift;
    my $gran = shift;
    my $numwords = shift;
    
	my $this = {};
	bless $this,$pkg;
	
	srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip`);
	
	$this->{G} = $gran;
	$this->{O} = $numwords;
	$this->{min_paragraphLength} = 4;
	$this->{max_paragraphLength} = 5;
	
	return $this;
}

sub inputString {
	my $this = shift;
	my $string = shift;
	# removes multiple spaces and other unncessary formating from text
	$string =~ s/^\s+//;
	$string =~ s/\s+/ /g;
	$string =~ s/ +/ /g;
	# save / add to originalText class var
	$this->{originalText} = encode_utf8 trim($this->{originalText} . ' '  . $string);
	# Remove all grabage
	$this->{originalText} =~ s/[^\w\d\s\x21-\x7E]+/ /g;
}



#
# turns text into an array and returns the result
# 
# @param  string $text - used to create word array - if false, var originalText is used
# @return array word array
#
sub createWordArray {
	my $this = shift;
	my $text = shift;
	if (!$text) {
		if (!$this->{originalText}) {
			print "Error: No input text!\n";
			return undef;
		}
		$text = $this->{originalText};
	}
	return split(/[\s]+/, $text);
}


#
# creates an array (frequency table) that is used to determine
# what words should be used in relation to each other.
# 
# @return bool - success or failure
#
sub frequencyTable {
	my $this = shift;
	if (!$this->{textwords}) {
		return undef if (!(@{$this->{textwords}} = $this->createWordArray()));  # no article text has been set!!
	}

	# set maximum number of entries for frequency table
	my $loopmax = $#{$this->{textwords}}+1 - $this->{G} + 1;
	my $frequency_table;
	# fill up the frequency table
	for (my $j = 0; $j < $loopmax; $j ++) {
		my $key_string;
		my $end = $j + $this->{G};
		for (my $k = $j; $k < $end; $k ++) {
			#build the key string (G - 1) words
			$key_string .= $this->{textwords}->[$k].' ';
		}
		if (!defined($frequency_table->{'_'.$key_string})) { $frequency_table->{'_'.$key_string}=""; }
		if (defined($this->{textwords}->[$j + $this->{G}])) { $frequency_table->{'_'.$key_string} .= $this->{textwords}->[$j + $this->{G}]." "; }
	}
	$this->{frequency_table} = $frequency_table;
	return 1;
}
        
        
#
# generates a new document using the text provided.  the frequency
# table, in combination with random numbers, to figure out what words
# should come before and after each other.
# 
# 
#
sub generate {
	my $this = shift;
	my $buffer;
	my $output;
	my @lastwords;
	my $frequency_table = $this->{frequency_table};
	my $textwords = $this->{textwords};
	# start off the output by adding the first G words
	# where G is the granularity integer
	for (my $i = 0; $i < $this->{G}; $i ++) {
		push @lastwords, $textwords->[$i];  # record last used words
		$output .= " ".$textwords->[$i];
	}
	# only output O words or less (O is a class var)
	for (my $i = 0; $i < $this->{O}; $i ++) {
		# create key_string that is used to determine words will come next
		my $key_string = "";
		for (my $j = 0; $j < $this->{G}; $j ++) {
			$key_string .= $lastwords[$j]." ";
		}
		# only if the above key_string has a corresponding record in the freq. table
		# find all the words in that record and create an array.  using that array
		# find 1 word (at random)
		if ($frequency_table->{'_'.$key_string}) {
			my @possible = split(/ /, trim($frequency_table->{'_'.$key_string}));
			my $c = $#possible + 1;
			my $r = int(rand($c));
			my $nextword = $possible[$r];
			$output .= " $nextword";
			# remove the first of the lastwords and add-on the most recent word
			for (my $l = 0; $l < $this->{G} -1; $l ++) {
				$lastwords[$l] = $lastwords[$l +1];
			}
			$lastwords[$this->{G} -1] = $nextword;
			# if no key_string record was found, add all the last words onto the
			# output again.  (isn't this a bad thing to do?)
		} else {
			#
			# The following was removed because it sometimes would output
			# the same series of words over and over again 
			#$lastwords = array_splice($lastwords, 0, count($lastwords));
			#for ($l = 0; $l < $this->G; $l ++) {
			#$lastwords[] = $textwords[$l];
			#$output .= ' '.$textwords[$l];
			#}
			#
		}
	} # end $i loop
	return $this->format($output);
}

#
# format the text into paragraphs
# 
#
sub format {
	my $this = shift;
	my $output = shift;
	# split output into sentence array
	#my $sentences=get_sentences($output);
	#my @sentences = shuffle @$sentences;
	$output =~ s/(\.|\?|!) /$1 <br>/g;
	my @sentences = split(/ <br>/, $output);

	# create paragraphs
	$output = "";
	while($#sentences+1 > 0) {
		# determine sentence count for this paragraph
		my $numberOfSentences = $this->{min_paragraphLength} + int(rand($this->{max_paragraphLength} - $this->{min_paragraphLength} + 1));
		$output.="\n\n";
		# now add all the sentences (up to limit)
		for(my $i = 0; $i < $numberOfSentences; $i++) {
			$output .= " ". pop(@sentences);
			last if ($#sentences+1 == 0);
		}
		#$output .="\n\n";
	}
	return $output;
}

1;

Usage:

Code:
    my $markov = SG::Markov->new(2, $word_cnt);
    $markov->inputString($content);
    $content = $markov->generate() if $markov->frequencyTable();
 
  • Like
Reactions: joe


Status
Not open for further replies.