PandA-2024.02
Nussinov.orig.c
Go to the documentation of this file.
1 
10 /*
11 Nussinov.c
12 
13 Builds a random sequence of base pairs, runs the Nussinov algorithm.
14 Unless FOUR_WAY_MAX_WITH_REDUNDANCY is define'd to true, it uses the more concise version as per the comment at the end of the file.
15 
16 Programmed by Dave Wonnacott at Haverford College <davew@cs.haverford.edu>, with help from Allison Lake, Ting Zhou, and Tian Jin, based on algorithm by Nussinov, described in Allison Lake's senior thesis.
17 
18 
19 compile&run with:
20  gcc -O3 -DNDEBUG -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov && ./Nussinov
21 
22 Or, to set a specific size such as 5000, use:
23  gcc -O3 -DNDEBUG -Dsize=5000 -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov && ./Nussinov
24 
25 -------------
26 For debugging/verifying we hit all the right points and have all updates before any pure reads:
27  gcc -Dsize=250 -DCHECK_DEPENDENCES=1 -DVERBOSE=1 -DVERBOSE_OUT=stderr -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov && ./Nussinov 2> /tmp/Nussinov-log-250.txt ; sort /tmp/Nussinov-log-250.txt -o /tmp/Nussinov-log-250.txt
28 
29 -------------
30 Re-generate small or big matrix for comparison:
31  gcc -O3 -DPRINT_SIZE=5000 -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov-print && echo 250 | ./Nussinov-print > printed-matrix-250.out
32 Or do the comparison
33  gcc -O3 -DPRINT_SIZE=5000 -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov-print && echo 250 | ./Nussinov-print | diff printed-matrix-250.out -
34 
35 -------------
36 Re-generate big matrices for comparison:
37  gcc -O3 -Dsize=5000 -DPRINT_SIZE=5000 -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov && ./Nussinov > printed-matrix-5000.out
38 
39  gcc -DSEED=17 -O3 -Dsize=5000 -DPRINT_SIZE=5000 -Wall -Wno-unused-value -Wno-unknown-pragmas Nussinov.c -o Nussinov && ./Nussinov > printed-matrix-5000-seed17.out
40 
41 */
42 
43 #include <string.h>
44 #include <math.h>
45 #include <stdlib.h>
46 #include <stdio.h>
47 #include <time.h>
48 #include <sys/time.h>
49 
50 typedef int bool;
51 const int true = 1;
52 const int false = 0;
53 
54 #if ! defined SCALAR_REPLACEMENT
55 #define SCALAR_REPLACEMENT 0
56 #endif
57 
58 #if ! defined CHECK_DEPENDENCES
59 #define CHECK_DEPENDENCES false
60 #endif
61 
62 #if defined NDEBUG
63 #define eassert(EXPR) 1
64 #else
65 #define eassert(EXPR) eassert_func(__STRING(EXPR), EXPR)
66 void eassert_func(char *expr, bool value)
67 {
68  if (!value) {
69  fprintf(stderr, "assertion failed: %s\n", expr);
70  exit(1);
71  // printf("assertion failed: %s\n", expr);
72  }
73 }
74 #endif
75 
76 
77 
78 //#define MAX_SIZE 16307
79 
80 #if ! defined FOUR_WAY_MAX_WITH_REDUNDANCY
81 #define FOUR_WAY_MAX_WITH_REDUNDANCY false
82 #endif
83 #if FOUR_WAY_MAX_WITH_REDUNDANCY
84 #define ZERO_IF_NO_REDUNDANCY 1
85 #else
86 #define ZERO_IF_NO_REDUNDANCY 0
87 #endif
88 
89 #if ! defined PRINT_SIZE
90 #define PRINT_SIZE 48 /* print for debugging if not bigger than this */
91 #endif
92 #if ! defined VERBOSE
93 #define VERBOSE false
94 #endif
95 #if ! defined VERBOSE_OUT
96 #define VERBOSE_OUT stdout
97 #endif
98 #if VERBOSE
99 #if ! defined REALLY_VERBOSE
100 #define REALLY_VERBOSE false
101 #endif
102 #endif
103 
104 #if ! defined SEED
105 #define SEED 42
106 #endif
107 
108 #define SLOWER (CHECK_DEPENDENCES | VERBOSE)
109 
110 double cur_time(void)
111 {
112  struct timeval tv;
113  struct timezone tz;
114  gettimeofday(&tv, &tz);
115  return tv.tv_sec + tv.tv_usec*1.0e-6;
116 }
117 
118 // for bases, use 0, 1, 2, 3, with (0, 3) and (1, 2) being matches
119 char *base_print[4] = { "A", "C", "G", "U" };
120 typedef int base; // could also try char, short
121 typedef int score; // could use uint, short, anything that can count high enough
122 
124 {
125  return (b1+b2) == 3 ? 1 : 0;
126 }
127 
128 inline score max_score(score s1, score s2)
129 {
130  return (s1 >= s2) ? s1 : s2;
131 }
132 
135 #define debug_N(x, y) (N_array[x][y]) /* read secretly without triggering has_been_read */
136 
137 #if ! CHECK_DEPENDENCES
138 #define N(x, y) (eassert(0 <= x && x < size && 0 <= y && y < size), N_array[x][y])
139 #if SCALAR_REPLACEMENT
140 #define MAX_N_DECLS() int max_tmp_i, max_tmp_j; score max_tmp
141 #define MAX_N_START(x,y) ((max_tmp_i=x), (max_tmp_j=y), (max_tmp = 0))
142 #define MAX_N(x, y, v) (eassert(max_tmp_i==x && max_tmp_j==y), eassert(0 <= x && x < size && 0 <= y && y < size), (max_tmp = max_score(max_tmp, v)))
143 #define MAX_N_END(x,y) (eassert(max_tmp_i==x && max_tmp_j==y), ((N_array[x][y]) = max_score(N_array[x][y], max_tmp)))
144 #else /* else not scalar replacement (inside check_deps) */
145 #define MAX_N(x, y, v) (eassert(0 <= x && x < size && 0 <= y && y < size), ((N_array[x][y]) = max_score(N_array[x][y], v)))
146 #endif /* else not scalar replacement (inside check_deps) */
147 #else /* not check_deps */
148 bool N_array_has_been_read[MAX_SIZE][MAX_SIZE];
149 #define N(x, y) (eassert(0 <= x && x < size && 0 <= y && y < size), \
150  (REALLY_VERBOSE?fprintf(VERBOSE_OUT, "i, j, k = %d, %d, %d: reading N[%d][%d]\n", i, j, k, x, y):1), \
151  (N_array_has_been_read[x][y] = (true)), \
152  N_array[x][y]+0)
153 #if SCALAR_REPLACEMENT /* inside not check_deps */
154 #error("Not yet ready to do scalar replacement and check_deps at the same time :-(")
155 #else /* else not scalar replacement (inside not check_deps) */
156 #define MAX_N(x, y, v) (eassert(0 <= x && x < size && 0 <= y && y < size), \
157  eassert(!N_array_has_been_read[x][y]), \
158  (N_array[x][y] = max_score(N_array[x][y], v)))
159 #endif /* else not scalar replacement (inside not check_deps) */
160 #endif
161 #if ! SCALAR_REPLACEMENT
162 #define MAX_N_DECLS()
163 #define MAX_N_START(x,y)
164 #define MAX_N_END(x,y)
165 #endif
166 
167 
168 /* Convenience function */
169 int getint(char *prompt)
170 {
171 #if VERBOSE_OUT == stderr
172  char *terminate = "\n";
173 #else
174  char *terminate = "";
175 #endif
176  int result;
177  int i=0;
178  while (
179  fprintf(stderr, "%s%s", prompt, terminate),
180  result = scanf("%d", &i),
181  result != 1 && result != EOF
182  ) {
183  fprintf(stderr, "Sorry, I didn't understand that...\n");
184  }
185  if (result == 1) {
186  return i;
187  } else {
188  fprintf(stderr, "Giving up ... can't read input\n");
189  exit(1);
190  }
191 }
192 
193 
194 int main(int argc, char *argv[])
195 {
196 #if ! SLOWER
197  double start_time, end_time; // , speed;
198 #endif
199 #if ! defined size
200  int size = getint("Enter length of random mRNA sequence (2200 is average for human mRNA): "); // Average (human) mRNA length is 2200; there's one that's 5000, though
201 #endif
202 
203  int i, j, k=-1;
204  MAX_N_DECLS();
205  char *options;
206 #if VERBOSE
207 #if CHECK_DEPENDENCES
208  options = " [DV]";
209 #else
210  options = " [V]";
211 #endif
212 #else
213 #if CHECK_DEPENDENCES
214  options = " [D]";
215 #else
216  options = "";
217 #endif
218 #endif
219 
220  printf("Running Nussinov RNA algorithm%s for sequence of length %d, random data with seed %d.\n",
221  options, size, SEED);
222 
223  if (size > MAX_SIZE) {
224  fprintf(stderr, "size (%d) < MAX_SIZE (%d)\n", MAX_SIZE, size);
225  exit(1);
226  }
227 
228  /* seed it with a constant so we can check/compare the results */
229  srand(SEED);
230  for (i = 0; i < size; i++)
231  seq[i] = rand() % 4;
232 
233 #if ! SLOWER
234  start_time = cur_time();
235 #endif
236 
237 // "OPTION 1"
238 #pragma scop
239  for (i = size-1; i >= 0; i--) {
240  for (j=i+1; j<size; j++) {
241 #if VERBOSE
242  fprintf(VERBOSE_OUT, "i, j, k = %d, %d, %d\n", i, j, k); /* outer k is -1 to indicate no k */
243 #endif
244  MAX_N_START(i, j);
245 #if FOUR_WAY_MAX_WITH_REDUNDANCY
246  if (j-1>=0) MAX_N(i, j, N(i, j-1));
247  if (i+1<size) MAX_N(i, j, N(i+1, j));
248 #endif
249  if (j-1>=0 && i+1<size) {
250  if (i<j-1) MAX_N(i, j, N(i+1, j-1)+match(seq[i], seq[j])); /* don't allow adjacent elements to bond */
251  else MAX_N(i, j, N(i+1, j-1));
252  }
253 
254  {
255  int k; /* local k to allow N macro to look at k and get -1 above and real k here */
256  for (k=i+ZERO_IF_NO_REDUNDANCY; k<j; k++) {
257 #if VERBOSE
258  fprintf(VERBOSE_OUT, "i, j, k = %d, %d, %d\n", i, j, k);
259 #endif
260  MAX_N(i, j, N(i, k)+N(k+1, j));
261  }
262  } /* end of local k */
263  MAX_N_END(i, j);
264  }
265  }
266 #pragma endscop
267 
268 #if !SLOWER
269  end_time = cur_time();
270  printf("done.\nTime elapsed: %fs\n", end_time-start_time);
271 #endif
272  printf("N(0, size-1) = %d\n", N(0, size-1));
273 
274  if (size <= PRINT_SIZE) {
275  for (i=0; i<size; i++)
276  printf("%3s ", base_print[seq[i]]);
277  printf("\n");
278  for(i = 0; i < size; i++) {
279  for(j = 0; j < size; j++) printf("%3d ", debug_N(i, j));
280  printf("\n");
281  }
282  }
283 
284  eassert(k == -1);
285 
286  return 0;
287 }
288 
289 /*
290 
291 Q: What is FOUR_WAY_MAX_WITH_REDUNDANCY?
292 
293 A: The original description of the Nussinov algorithm (#1 below) is equivalent to the simpler version (#2 below, also Allie's thesis).
294  (Note that all variants define N(i,j) for 1 <= i < j <= L; use N=0 when reading other elements.)
295 
296  This seems to have essentially zero performance impact, as it is in the O(L^2) rather than O(L^3) part of the code
297 
298 ----- 1. Original Nussinov -----
299 
300 (as described on http://ultrastudio.org/en/Nussinov_algorithm)
301 
302 N(i, j) = max(
303  N(i+1, j )
304  N(i , j-1)
305  N(i+1, j-1) + w(i, j)
306  max_{i< k<j}(N(i, k) + N(k+1, j)
307 )
308 
309 Note that N(i , j-1) is redundant, considering two cases after noting i<j from context, so i<=j-1:
310 Case 1: i = j-1, i.e., the maximal value for any column
311  Here, N(i, j-1) is N(i, i), which is 0 and can be ignored
312 
313 Case 2: i < j-1
314  here, N(i, j-1) is the same the max term's biggest k, i.e. k=j-1 gives N(i, j-1)+N(j-1+1, j) and the N(j,j) term is 0
315 
316 
317 ----- 2. Simpler Variant (possibly known as Nussinov-Jacobsen?) -----
318 
319 http://www.ibi.vu.nl/teaching/masters/prot_struc/2008/ps-lec12-2008.pdf
320 
321 N(i, j) = max(
322  N(i+1, j-1) + w(i, j)
323  max_{i<=k<j} N(i, k) + N(k+1, j)
324 )
325 
326 break out the "=" from i<=k:
327 
328 N(i, j) = max(
329  N(i+1, j-1) + w(i, j)
330  max_{i=k<j} N(i, k) + N(k+1, j)
331  max_{i< k<j} N(i, k) + N(k+1, j)
332 )
333 
334 rewrite k's as i's in i=k version, drop i<j due to redundancy with context
335 
336 N(i, j) = max(
337  N(i+1, j-1) + w(i, j)
338  N(i, i) + N(i+1, j)
339  max_{i< k<j} N(i, k) + N(k+1, j)
340 )
341 
342 Note all N(i, i) are zero:
343 
344 N(i, j) = max(
345  N(i+1, j-1) + w(i, j)
346  N(i+1, j)
347  max_{i< k<j} N(i, k) + N(k+1, j)
348 )
349 
350 See also http://www.pnas.org/content/77/11/6309.full.pdf
351 
352  */
#define MAX_N(x, y, v)
#define MAX_SIZE
Definition: tb.c:9
int score
char base
This version is stamped on May 10, 2016.
Definition: nussinov.c:24
#define PRINT_SIZE
Definition: Nussinov.orig.c:90
#define N(x, y)
char * base_print[4]
score N_array[MAX_SIZE][MAX_SIZE]
#define eassert(EXPR)
Definition: Nussinov.orig.c:65
score match(base b1, base b2)
base seq[MAX_SIZE]
int main(int argc, char *argv[])
#define debug_N(x, y)
int bool
This version is stamped on May 10, 2016.
Definition: Nussinov.orig.c:50
static const uint32_t k[]
Definition: sha-256.c:22
#define ZERO_IF_NO_REDUNDANCY
Definition: Nussinov.orig.c:86
score max_score(score s1, score s2)
int result[SIZE]
Definition: adpcm.c:800
int base
#define VERBOSE_OUT
Definition: Nussinov.orig.c:96
TVMArray b2[1]
#define MAX_N_DECLS()
void eassert_func(char *expr, bool value)
Definition: Nussinov.orig.c:66
TVMArray b1[1]
int getint(char *prompt)
#define MAX_N_START(x, y)
#define MAX_N_END(x, y)
#define SEED
double cur_time(void)

Generated on Mon Feb 12 2024 13:02:50 for PandA-2024.02 by doxygen 1.8.13