#awk program for Mac OS X to generate two output files from opticon csv data file.
#OP1 gives annealing vs time data for plotting in Excel.
#OP2 gives concentration calculations for use in determining Cot.
#Variables
#i is the field count (per row). After the first subroutine i= (number of fields +1) = (number of wells +4) because of three left hand index columns (read, cycle, step) in the csv output file.
#w[i] is the well name for column i (e.g. B10)
#a[j] is the number of samples in a row j (numbered 1-8 which corresponds to rows A through H). Since there should always be an equal number of samples and references in each row, a[j] should be an even number from 0-8
#v[h] is the position column h in the opticon raw data file should take in the final formatted file, so that all the reference wells are on the left hand side of the spreadsheet and all the sample wells are on the right hand side of the spreadsheet. Because of the left hand index fields in the raw data file h ranges from 4 to (number of samples + 3) while v[h] ranges from 1 to (number of samples).
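#r is the array index for each read. The counter starts at 1 on the first record of the file and every record is stored, so when the first record is the well-name header row the first data row is r = 2.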
#value[r,g] tracks the left hand index fields (g=1 to 3) for each row r. If the Opticon run follows conventions, then the melt step and final annealing step can be identified from these fields.
#ordered[r,v[g]] is the raw fluorescence value for a read placed into the correct sample column after a spreadsheet has been ordered. r is the row and v[g] is the column for the cell.
#melt is the read number immediately following the melt step (defined as a cycle with exactly 5 steps, since the output file does not specify temp) It is also the t0. The cycle number for baseline measurement will be melt-6
#last is the last read before a melting curve (the melting curve is detected because it has more than 100 cycles).
#finish is the last read to be plotted in the annealing graph (to avoid confusion with melting curve data or file annotations). It is defined by the maximum value of r before escaping the loop (oldr) if there is no melting curve or by the value last if there is a melting curve.
#label[ww] is the sample name given to well ww, taken from annotation at the bottom of the Opticon data file. This will be put into the summary sheet of OP2.csv.
#zero[q] is the fluorescence corresponding to 0% annealing for sample q. It is calculated by dividing the fluorescence intensity of a sample well during the gradient (melt) incubation by the fluorescence intensity of the corresponding reference well. The reference well intensity several reads before the gradient step is used, because the reference well is actually cooled slightly from the annealing temperature during the gradient step and this will artifactually increase its fluorescence. This problem also explains why the plot[0,q] data point is measured using a reference well fluorescence measured several reads beforehand.
#baseline[q] is the normalized fluorescence corresponding to 100% annealing before the melt (assuming the sample is in 100% homoduplex form before the annealing reaction). It is not yet corrected for the zero point background fluorescence.
#plot[r,q] is the annealing plot value (after normalization to reference and correction for zero and baseline fluorescence values) for read r and sample q.
#conc[q] is the relative concentration of an individual sample, measured as the fluorescence intensity of the sample and reference wells during the baseline step minus fluorescence intensity of the sample well during the zero step.
#t is the time, with time 0 corresponding to "melt"--the time at which the annealing process starts.
#Other loop variables include tt, t, p, q, qq, dd, ww
BEGIN {
    # Fields are comma separated on input and on output.
    FS  = ","
    OFS = ","

    # Counters shared with the main rule and the END rule.
    i  = 4    # header scan cursor; the scan starts reading well names at field 4
    r  = 1    # index under which the next record is stored
    t  = 0    # time-row counter used when OP1.csv is written
    tt = 1    # column counter used when OP1.csv is written
    ww = 1    # index under which the next captured label is stored
}
#Read the well names from the first record and figure out how many wells were used in each row
# Main rule, run for every input record. It works in three phases:
#   1. the well-name header supplies w[], a[] and the column re-ordering map v[];
#   2. records are stored as reads until a one-character record ends the table;
#   3. every record after that is scanned for a sample label in its third field.
{
    # Phase 1: parse the well-name header exactly once.
    # The scan starts at field 4 and stops one short of NF, i.e. the last field
    # of the record is not treated as a well name. Records with no field beyond
    # the start column are ignored here, so a short leading line does not
    # consume the one-time parse. Without the header_seen guard any later
    # record wider than the header would be re-read as well names and would
    # change w[], a[] and i, all of which the END rule relies on.
    if (!header_seen && i < NF) {
        header_seen = 1

        while (i < NF) {
            w[i - 3] = $i
            # Count the wells per plate row: a[1] for "A" ... a[8] for "H".
            row_letter = substr($i, 1, 1)
            for (j = 1; j <= 8; j++) {
                if (row_letter == substr("ABCDEFGH", j, 1)) {
                    a[j]++
                }
            }
            i++
        }

        # Build v[]: within each plate row the first half of the columns gets
        # consecutive positions k = 1, 2, ... and the matching column in the
        # second half gets position k - 2 + i/2 (k shifted by the pair count).
        # The map depends only on the header, so it is built here once rather
        # than on every record.
        h = 4
        k = 1
        for (j = 1; j <= 8; j++) {
            half = a[j] / 2
            for (q = 1; q <= half; q++) {
                v[h] = k
                v[h + half] = k - 2 + (i / 2)
                k++
                h++
            }
            # skip over the second half of this row, already mapped above
            h += half
        }
    }

    # Phase 2: store the record as read r.
    # r < 10000 means "still inside the data table".
    if (r < 10000) {
        # A record that is exactly one character long marks the end of the
        # table: push r past the limit so that no later record is stored.
        if (length($0) == 1) {
            r = 100000
        }

        # the three left-hand index fields
        for (g = 1; g <= 3; g++) {
            value[r, g] = $g
        }
        # the fluorescence readings, moved to their re-ordered column;
        # the last field of the record is not stored
        while (g < NF) {
            ordered[r, v[g]] = $g
            g++
        }

        # melt: this read's third index field is larger than the previous
        # stored read's, and the previous read's second index field is 5.
        if ((value[r, 3] > value[oldr, 3]) && (value[oldr, 2] == 5)) {
            melt = r
        }
        # last: set 102 reads back once the second index field reaches 101.
        if (value[r, 2] == 101) {
            last = r - 102
        }

        # oldr only follows real data rows, never the end-of-table sentinel
        if (r < 10000) {
            oldr = r
        }
        r++
        next
    }

    # Phase 3: capture well labels from the records after the table.
    # The final character of the third field is dropped (presumably a trailing
    # carriage return -- confirm against a real Opticon export).
    if (length($3) > 1) {
        label[ww] = substr($3, 1, length($3) - 1)
        ww++
    }
}
# END rule: runs after all input has been read. It normalises the stored
# fluorescence reads into annealing curves, computes a relative concentration
# per sample, and writes the results:
#   OP1.csv - one header line, then one line per time point with plot[t, q]
#   OP2.csv - well header, label row and concentration row
# Both files are opened with ">>", so output is appended to any existing file
# of the same name; remove stale OP1.csv / OP2.csv before re-running.
# NOTE(review): if melt was never set by the main rule, the denominators below
# are sums of unset elements and awk will stop with a division-by-zero error.
END {
    # Number of reference/sample pairs. Reference q lives in column q of
    # ordered[], its paired sample in column q + npairs.
    npairs = (i / 2) - 2

    #process annealing curves
    for (q = 1; q <= npairs; q++) {
        smp = q + npairs

        # 0% annealing level: sample over reads melt-4..melt-1 divided by
        # reference over reads melt-10..melt-7
        zero[q] = ((ordered[melt-4, smp] + ordered[melt-3, smp] + ordered[melt-2, smp] + ordered[melt-1, smp]) / (ordered[melt-10, q] + ordered[melt-9, q] + ordered[melt-8, q] + ordered[melt-7, q]))

        # 100% annealing level: sample and reference both over reads
        # melt-10..melt-7
        baseline[q] = ((ordered[melt-9, smp] + ordered[melt-8, smp] + ordered[melt-7, smp] + ordered[melt-10, smp]) / (ordered[melt-9, q] + ordered[melt-8, q] + ordered[melt-7, q] + ordered[melt-10, q]))

        # Last read to plot. The value does not depend on q; it is assigned
        # inside the loop so that finish stays unset when there are no pairs.
        if (last > 0) {
            finish = last
        } else {
            finish = oldr
        }

        # time 0 uses the reference reading from read melt-7
        plot[0, q] = (((ordered[melt, smp] / ordered[melt-7, q]) - zero[q]) / (baseline[q] - zero[q]))
        for (p = melt + 1; p <= finish; p++) {
            plot[(p - melt), q] = (((ordered[p, smp] / ordered[p, q]) - zero[q]) / (baseline[q] - zero[q]))
        }
    }

    #calculate concentrations
    # Mean of the eight sample and reference readings over reads
    # melt-39..melt-36, minus the mean of the four sample readings over reads
    # melt-4..melt-1. The reference terms for melt-37 and melt-36 previously
    # lacked the column subscript and therefore always contributed 0.
    for (q = 1; q <= npairs; q++) {
        smp = q + npairs
        conc[q] = .125*(ordered[melt-39, smp] + ordered[melt-39, q] + ordered[melt-38, smp] + ordered[melt-38, q] + ordered[melt-37, smp] + ordered[melt-37, q] + ordered[melt-36, smp] + ordered[melt-36, q]) - .25*(ordered[melt-4, smp] + ordered[melt-3, smp] + ordered[melt-2, smp] + ordered[melt-1, smp])
    }

    # Column headers shared by both files: for every plate row, the well names
    # of the first half of that row's columns, each preceded by a comma.
    hdr = ""
    q = 1
    for (qq = 1; qq <= 8; qq++) {
        for (dd = 1; dd <= a[qq] / 2; dd++) {
            hdr = hdr "," w[q]
            q++
        }
        # skip the second half of the row
        q += (a[qq] / 2)
    }

    # All file data is written through an explicit "%s" format so that a "%"
    # inside a well name or label is emitted literally instead of being
    # interpreted by printf.

    #output column headers to OP1
    printf("%s", "Time" hdr) >> "OP1.csv"

    #Output annealing curves to OP1
    # Each line is "t," followed by every value with a trailing comma; the
    # newline is written before a line, so the file does not end with one.
    for (t = 0; t <= (finish - melt); t++) {
        printf("%s", "\n" t ",") >> "OP1.csv"
        for (tt = 1; tt <= npairs; tt++) {
            printf("%s", plot[t, tt] ",") >> "OP1.csv"
        }
    }

    #output column headers to OP2
    printf("%s", "Well" hdr "\n") >> "OP2.csv"

    #output labels to OP2
    # One more label than there are pairs is written (label[1] lands under the
    # "Well" heading); this matches the original output layout.
    for (q = 1; q <= npairs; q++) {
        printf("%s", label[q] ",") >> "OP2.csv"
    }
    printf("%s", label[q] ",") >> "OP2.csv"

    #output concentrations to OP2
    printf("%s", "\nConcentration") >> "OP2.csv"
    for (q = 1; q <= npairs; q++) {
        printf("%s", "," conc[q]) >> "OP2.csv"
    }
}