<steven.huwig@[EMAIL PROTECTED]
> schreef in bericht
news:f31214a5-f659-4d9a-8133-a6f2f89b4d76@[EMAIL PROTECTED]
>I didn't use the Google Groups form correctly and the posted code was
> not valid. Hopefully the below comes out right.
>
>
> # Waterman's Algorithm R for random sampling
> # by way of Knuth's The Art of Computer Programming, volume 2
>
> BEGIN {
> if (!n) {
> print "Usage: sample.awk -v n=[size]"
> exit
> }
> t = n
> srand()
> }
>
> NR <= n {
> pool[NR] = $0
> places[NR] = NR
> next
> }
>
> NR > n {
> t++
> M = int(rand()*t) + 1
> if (M <= n) {
> READ_NEXT_RECORD(M)
> }
> }
>
> END {
> if (NR < n) {
> print "sample.awk: Not enough records for sample" \
> > "/dev/stderr"
> exit
> }
> # gawk needs a numeric sort function
> # since it doesn't have one, zero-pad and sort alphabetically
> pad = length(NR)
> for (i in pool) {
> new_index = sprintf("%0" pad "d", i)
> newpool[new_index] = pool[i]
> }
> x = asorti(newpool, ordered)
> for (i = 1; i <= x; i++)
> print newpool[ordered[i]]
> }
>
> function READ_NEXT_RECORD(idx) {
> rec = places[idx]
> delete pool[rec]
> pool[NR] = $0
> places[idx] = NR
> }
>
>
> # Vitter's Algorithm Z for random sampling
> # http://doi.acm.org/10.1145/3147.3165
#
>
> BEGIN {
> if (!n) {
> print "Usage: sample.awk -v n=[size]"
> exit
> }
> T = 10 # tuneable per (Vitter 85)
> srand()
> nplusone = n + 1
> }
>
> NR == nplusone {
> pool[NR] = $0
> places[NR - 1] = NR
> t = n
> thresh = T*n
> num = 0
> }
>
> NR > n && t > thresh {
> if (!W) {
> W = exp(-log(rand())/n)
> term = t - n + 1
> }
> while (1) {
> U = rand()
> X = t*(W - 1.0)
> S = int(X)
> lhs = exp(log(((U*(((t + 1)/term)^2))*(term + S))/(t + X))/n)
> rhs = (((t + X)/(term + S))*term)/t
> if (lhs <= rhs) {
> W = rhs/lhs
> break
> }
> y = (((U*(t + 1))/term)*(t + S + 1))/(t + X)
> if (n < S) {
> denom = t
> numer_lim = term + S
> } else {
> denom = t - n + S
> numer_lim = t + 1
> }
> for (numer = t + S; numer > numer_lim; numer--) {
> y = (y*numer)/denom
> denom--
> }
> W = exp(-log(rand())/n)
> if (exp(log(y)/n) <= (t + X)/t)
> break
> }
> if (SKIP_RECORDS(S))
> READ_NEXT_RECORD(int(n*rand()))
> else
> exit
> t = t + S + 1
> term = term + S + 1
> next
> }
>
> NR > n && t <= thresh {
> V = rand()
> S = 0
> t++
> num++
> quot = num/t
> while (quot > V) {
> S++
> t++
> num++
> quot = (quot*num)/t
> }
> if (SKIP_RECORDS(S))
> READ_NEXT_RECORD(int(n * rand()))
> else
> exit
> next
> }
>
> NR <= n {
> pool[NR] = $0
> places[NR - 1] = NR
> next
> }
>
> END {
> if (NR < n) {
> print "sample.awk: Not enough records for sample" \
> > "/dev/stderr"
> exit
> }
> # gawk needs a numeric sort function
> # since it doesn't have one, zero-pad and sort alphabetically
> pad = length(NR)
> for (i in pool) {
> new_index = sprintf("%0" pad "d", i)
> newpool[new_index] = pool[i]
> }
> x = asorti(newpool, ordered)
> for (i = 1; i <= x; i++)
> print newpool[ordered[i]]
> }
>
> function SKIP_RECORDS(skip) {
> retval = 1
> for (i = 0; i < skip; i++)
> retval = getline
> return retval > 0
> }
>
> function READ_NEXT_RECORD(idx) {
> rec = places[idx]
> delete pool[rec]
> pool[NR] = $0
> places[idx] = NR
> }
>
i tried something
$ wc -l american
212710 american
$ while true; do awk -v n=1 vitter-sample.awk american ; done
unpersonalized
unpersonalized
kerosene
kerosene
kerosene
kerosene
kerosene
Kiyo****
Kiyo****
Kiyo****
Kiyo****
Kiyo****
Navratilova's
Navratilova's
Navratilova's
Navratilova's
Navratilova's
pass****t's
$ while true; do awk -v n=1 waterman-sample.awk american ; done
physio
physio
physio
AA
petulant
AA
petulant
AA
petulant
AA
petulant
AA
petulant
AA
petulant
AA
petulant
AA
petulant
AA
incineration's
AA
incineration's
At 1st i was amazed by the randomness of this script,
but i think i need an expert on randomness to tell why above is
happening...
(but still, thanks for sharing the scripts!)


|