#!/bin/sh
#
# Scramble input ala Rob Pike's character: "Mark F. Shaney", Usenix 6/94.
#
# For each three consecutive words in the input, add the third to a list
# of valid successors to the first two. At EOF, emit a Markov Chain, beginning
# with the first two input words, picking a random successor from the
# list such that the new word has the same likelihood of appearing after
# those two in the output as it did in the input. Repeat, advancing one word,
# using the new word as input. (Essentially jump haphazardly through the
# input constraining each jump to be to a similar 3 word context.)
#
# The output is non-deterministic (each run is different).
#
# Rob Pike promised to post his code but I haven't seen it. This code
# does what I think he said he did in his talk.
#
# The -j flag causes short input words to be combined with the following word
# which makes the output somewhat more coherent for large amounts of input.
#
# This program is completely useless except that it may provide entertainment
# if suitable input is given. Suggested input is about 10-30 thousand words
# either of text you are quite familiar with or the combination of several
# unrelated texts such as the bodies of different letters or news articles.
# Rob used the bodies of net.singles (now soc.singles) articles with headers,
# signatures, and embedded quotes of other articles deleted. The formatting
# of lines, words, paragraphs in the input is irrelevant. The output will
# be one long paragraph.
#
# This program is completely unsupported. Enjoy.
#
# Corey Satten 6/10/94
#
PATH=/usr/local/bin:/bin:/usr/bin:/usr/ucb:$PATH

# Find an awk for this script.  The probe program uses the "in" operator on
# an array; the first candidate that runs it successfully is kept.  If every
# candidate fails, the loop ends with AWK set to the placeholder "none".
for AWK in mawk nawk gawk awk none; do
  ($AWK 'BEGIN{++a[1]; if(1 in a); exit}') 2>/dev/null && break
done
case $AWK in
none)
  echo "$0: needs nawk or mawk or gawk; sorry" 1>&2
  exit 1  # a bare "exit" here would report success (the status of echo)
  ;;
esac

# Consume leading -j flags: join small words onto the following word with "_".
# JOIN is reset first so that a value inherited from the environment can
# never reach the awk command line.
JOIN=
while :; do
  case ${1-} in
  -j) JOIN='j=1'; shift ;;
  *) break ;;
  esac
done

# The awk program reports progress on the terminal.  Opening /dev/tty fails
# when there is no controlling terminal, and that failure would abort awk,
# so probe once here (in a subshell, so a failed redirection cannot end this
# script) and only hand awk the device name when it can be opened.
PROGRESS=
if (: >/dev/tty) 2>/dev/null; then
  PROGRESS='tty=/dev/tty'
fi

# Strip some punctuation and make each word be on a line by itself.
# "$@" keeps every remaining file name intact; with no file names cat reads
# standard input.  The sentinel word appended after the input is the last
# word awk sees: the word pair ending in it has no recorded successor, which
# is what stops the chain (the sentinel itself is printed as the final word).
(cat "$@" | tr -d '\\()\133\135"' | tr -s ' \011\012' '\012'; echo ThE_-_EnD) |

# Do the markov chain.  j and tty arrive as command-line assignments; either
# may be absent, in which case the corresponding feature is simply off.
$AWK '
BEGIN {
  # Read the first two non-empty words.  They begin the output and prime
  # the two-word window (last2, last1) used while reading the rest.
  getline; while ($0~/^$/) getline; first1 = last2 = $0
  getline; while ($0~/^$/) getline; first2 = last1 = $0
  srand('"$$"')   # seed from the shell process id so each run differs
}
/^$/ {next}
{
  # Process the input, build the successor lists.
  if (j==1 && length($0) < 3) {
    # -j mode: glue a short word onto the word that follows it.
    save = $0; getline; $0 = save "_" $0
  }
  # Append this word to the space-separated successor list of the two
  # preceding words.  Repeats are kept, so a frequent successor is picked
  # proportionally more often later.
  i[last2 " " last1] = i[last2 " " last1] $0 " "
  last2 = last1; last1 = $0   # shift to next word
  # Every 300 words show the running count on the terminal, if there is one.
  if (++count % 300 == 0 && tty != "") { print count > tty; close(tty) }
}
END {
  # Begin with first 2 words, emit the Markov chain.
  printf("%s\n%s\n", first1, first2)
  last2 = first1; last1 = first2
  while (1) {
    s = split(i[last2 " " last1], a)
    nxt = a[int(rand() * s + 1)]   # uniform pick among the s list entries
    if (nxt == "") {exit}          # no successor recorded: chain is done
    print nxt
    last2 = last1; last1 = nxt
  }
}' ${JOIN:+"$JOIN"} ${PROGRESS:+"$PROGRESS"} |

# Collect words back into lines.
fmt