понедельник, 3 апреля 2017 г.

Певзнер. Задачи.

 2.1) Разделиться список на два (большие и маленькие значения) сравнивая исходный по парам ([n/2]). Находим мин в списке с маленькими значениями и Макс в списке с большими.

2.2)

m = [8, 4, 6]

res = ''for i0 in range(m[0]+1):
    res += str(i0)
    for i1 in range(m[1]+1):
        res += str(i1)
        for i2 in range(m[2] + 1):
            res += str(i2)
print(res)

def recursion(iter_num, iter_val, res):
    if iter_num < len(m):
        if iter_val <= m[iter_num]:
            res += str(iter_val)
            res = recursion(iter_num + 1, 0, res)
            return recursion(iter_num, iter_val + 1, res)
        else:
            return res
    else:
        return res

print(recursion(0, 0, ''))
 
 
2.3) да нет нет 

2.17) бактерий на k-ом шаге будет 2^k n - 2^k k. Отсюда видно, что через n шагов все бактерии будут съеден


2.18) как? 99*2 только если сразу попадется правдивей

2.19)  

воскресенье, 26 февраля 2017 г.

Enumerating k-mers Lexicographically (ROSALIND LEXF)

Given: A collection of at most 10 symbols defining an ordered alphabet, and a positive integer n (n≤10).

Return: All strings of length n that can be formed from the alphabet, ordered lexicographically.

s = input()
n = int(input())

alphabet = s.split(' ')

def cycle(res, k):
    if k > 1:
        new_res = []
        for r in range(len(res)):
            for i in alphabet:
                new_res.append(res[r] + i)
        return cycle(new_res, k-1)
    else:
        return res

for i in cycle(alphabet, n):
    print(i)

Enumerating Oriented Gene Orderings (ROSALIND SIGN)

Given: A positive integer n≤6.

Return: The total number of signed permutations of length n, followed by a list of all such permutations (you may list the signed permutations in any order).

import copy
import math


def permutations(my_set, perm):
    if len(my_set) > 0:
        res = []
        for s in my_set:
            new_my_set = copy.deepcopy(my_set)
            new_my_set.remove(s)
            for j in permutations(new_my_set, perm + str(s)):
                res.append(j)
        return res
    else:
        return [perm]


def lenn_sets(my_set, res, k):
    if k > 1:
        new_res = []
        for r in range(len(res)):
            for i in my_set:
                new_res.append(res[r] + i)
        return lenn_sets(my_set, new_res, k-1)
    else:
        return res


n = 6r1 = permutations(list(range(1,n+1)), '')
my_set2 = ['+', '-']
r2 = lenn_sets(my_set2, copy.deepcopy(my_set2), n)
f = open('sign.txt', 'w')
f.write(str(int(math.factorial(n)*math.pow(2, n)))+'\n')
for r_1 in r1:
    for r_2 in r2:
        l = ''        for i in range(n):
            l += r_2[i] + r_1[i] + ' '        f.write(l.replace('+', '')+'\n')
f.close()

четверг, 23 февраля 2017 г.

Locating Restriction Sites (ROSALIND REVP)

Given: A DNA string of length at most 1 kbp in FASTA format.

Return: The position and length of every reverse palindrome in the string having length between 4 and 12.

s = input()

def complementary(a, b):
    if (a == 'A' and b == 'T') or (a == 'T' and b == 'A') or (a == 'G' and b == 'C') or (a == 'C' and b == 'G'):
        return True    else:
        return False

res = []
for i in range(1, 6):
    if complementary(s[i - 1], s[i]):
        j = 1        while j < i:
            if complementary(s[i - 1 - j], s[i + j]):
                res.append([i-j, 2 * (j+1)])
                j += 1            else:
                breakfor i in range(6, len(s)-5):
    if complementary(s[i - 1], s[i]):
        j = 1        while j < 6:
            if complementary(s[i - 1 - j], s[i + j]):
                res.append([i - j, 2 * (j+1)])
                j += 1            else:
                breakfor i in range(len(s)-5, len(s)-1):
    if complementary(s[i - 1], s[i]):
        j = 1        while j < len(s)-i:
            if complementary(s[i - 1 - j], s[i + j]):
                res.append([i - j, 2 * (j+1)])
                j += 1            else:
                breakfor r in res:
    print(''.join(str(r[0])) + ' ' + ''.join(str(r[1])))

среда, 22 февраля 2017 г.

RNA Splicing (ROSALIND SPLC)

Given: A DNA string s (of length at most 1 kbp) and a collection of substrings of s acting as introns. All strings are given in FASTA format.

Return: A protein string resulting from transcribing and translating the exons of s. (Note: Only one solution will exist for the dataset provided.)

import re
import rosalind_lib

f = open('splc.txt', 'r')
strings = re.findall(r'(>Rosalind_[0-9]+)\n(([A-Z]+\n)+)', f.read())
rna = strings.pop(0)[1].replace('\n','').replace('T', 'U')
introns = {}
for s in strings:
    intron = s[1].replace('\n','').replace('T', 'U')
    rna = rna.replace(intron,'')
print(rosalind_lib.rna_to_protein(rna))



понедельник, 20 февраля 2017 г.

Find a substring

t = '010's = '0111010100001'print(s.find(t)) - first position

Common problem: we have string s (usually very long) and string t (usually short). Find positions, where t is a substring of s.
Example: for s = 'TATGCAATGC' and t = 'TGC' positions are: 2, 7.

Trere are several algorithms to find them: wikipedia.
I'll show here two easiest ways:
naive algorithm (find all pos):
def substring_positions(s,t):
    res = []
    for i in range(len(s) - len(t) + 1):
        flag = True        for j in range(len(t)):
            if s[i + j] != t[j]:
                flag = False                break        if flag:
            res.append(str(i))
    return res

knuth-morris-pratt algorithm:
def kmp(s,t, limit = 1):
    # building prefix table pt for s    # it's i-th element shows, how many steps go backward on s    # if there will be a mismatch with t string    pt = [0]
    for m in s[1:]:
        j = pt[-1]
        while j > 0 and m != s[j]:
            j = pt[j-1]
        if m == s[j]:
            j += 1        pt.append(j)
    # walk through the s    res = []
    j = 0 # number or symbols matched with t    for i in range(len(s)):
        while j > 0 and s[i] != t[j]:
            j = pt[j - 1]
        if s[i] == t[j]:
            j += 1        if j == len(t):
            res.append(i - j + 1)
            if len(res) == limit:
                return res
            j = 0    return res
 

среда, 15 февраля 2017 г.

Bioinformatics Contest 2017 (task1)

Everybody knows that a huge number of different chemical reactions happen in cells. A reaction takes as an input a set of chemicals (substrates) and converts them to another set (products). Here we consider all reactions to be one-directional. A substrates for a reaction could be chemicals from the environment or products from other reactions. 
A scientist John Doe was given a cell and a list of chemicals that are present in the environment at the beginning. He already knows what reactions could happen in the cell. You should help him to understand which chemicals could appear in the cell.

Input Format


The first line contains initial chemicals separated by spaces. There is always at least one initial chemical. Each chemical is represented by a random positive integer that fits in 4-byte integer type. Each of the following lines describes a reaction, one per line. Each reaction is presented as two lists of integers separated by '->': the list of chemicals, separated by '+', that is needed to perform a reaction and the list of chemicals, separated by '+', that are produced as a result of the reaction. Each chemical could be present in each reaction maximum 2 times: one time at the left part and the other time at the right part (for example, a catalyst could appear in both parts).
Constraints for the easy version: a total number of chemicals through all reactions does not exceed 103
.
Constraints for the hard version: a total number of chemicals through all reactions does not exceed 105.

import sys


def chem_recorder(data_set):
    while len(data_set) > 0:
        new_data_set = set()
        for add_chem in data_set:
            if add_chem not in chem_in_cell:
                chem_in_cell.add(add_chem)
                if add_chem in chems_lines:
                    counter = 0                    for chem_line in chems_lines[add_chem]:
                        chem_line[0].remove(add_chem)
                        if len(chem_line[0].difference(chem_in_cell)) == 0:
                            new_add = chem_line[1].difference(chem_in_cell)
                            new_data_set = new_data_set.union(new_add)
                            chems_lines[add_chem].pop(counter)
                        counter += 1        data_set = new_data_set.copy()


all_lines = sys.stdin.readlines()
in_cell = all_lines.pop(0)
chem_in_cell = set(in_cell.strip().split(' '))
xxx = set(in_cell.strip().split(' '))
chems_lines = {} # 'chem' => [[{left_line1}, {right_line1}], [{left_line2}, {right_line2}]]for line in all_lines:
    line = line.strip()
    l1 = line.split('->')
    left_line = set(l1[0].split('+'))
    right_line = set(l1[1].split('+'))
    not_in_cell_chem = left_line.difference(chem_in_cell)
    if len(not_in_cell_chem) > 0:
        for nic_chem in not_in_cell_chem:
            if nic_chem in chems_lines:
                flag = 0                for chem_line in chems_lines[nic_chem]:
                    if chem_line[0] == not_in_cell_chem:
                        chem_line[1] = chem_line[1].union(right_line)
                        flag = 1                        break                if flag == 0:
                    chems_lines[nic_chem].append([not_in_cell_chem, right_line])
            else:
                chems_lines[nic_chem] = [[not_in_cell_chem, right_line]]
    else:  # gote all chems        chem_recorder(right_line)
chem_recorder(xxx)
print(' '.join(chem_in_cell))

Bioinformatics Contest 2017 (task2)

  • RNA consists of one strand (chain of ribonucleotides) most of the time, and sometimes it can fold into the secondary structure. In this problem we consider a simpler secondary form than in real life. In this form, some pairs of complementary ribonucleotides build hydrogen bonds between them and each ribonucleotide can build at most one bond. Each bond can be represented as a pair of indices (i,j) (i<j) of the string S that represents RNA. These indices indicate the positions of the symbols in S corresponding to these ribonucleotides.
    Also, in the secondary structure one additional condition holds for each pair of hydrogen bonds (i1,j1), (i2,j2), such that i1<i2:
    • they do not intersect i1<j1<i2<j2))
    • or the second bond is inside the first one (i1<i2<j2<j1).
My first solution:
import sys
import math


def norm(str1, str2):
    return math.fabs(int(str1) - int(str2))


def check_perfect(s):
    len_s = len(s)
    loops_tops = []
    for i in range(1, len_s):
        if norm(s[i], s[i - 1]) == 1:
            loops_tops.append(i)
    if len(loops_tops) == 0:
        return False    for start in loops_tops:
        step_right = 0        step_left = 1        move = True        while move:
            right = int(s[start + step_right])
            left = int(s[start - step_left])
            if (right == 9) or (left == 9):
                move = False            elif norm(left, right) != 1:
                move = False            else:
                s[start + step_right] = '9'                s[start - step_left] = '9'            if ((start + step_right + 1) >= len_s) or ((start - step_left - 1) < 0):
                move = False            step_right += 1            step_left += 1    new_s = [x for x in s if x != '9']
    len_new_s = len(new_s)
    if len_new_s == 0:
        return True    else:
        if len_new_s == len_s:
            return False        else:
            return check_perfect(new_s)


def check_perfect_if_one_hole(s):
    len_s = len(s)
    if len_s == 1:
        return True    holed_tops = []
    for i in range(2, len_s):
        if norm(s[i], s[i - 2]) == 1:
            holed_tops.append(i - 1)
    for hole in holed_tops:
        s1 = s.copy()
        del s1[hole]
        if check_perfect(s1):
            return True    loops_tops = []
    for i in range(1, len_s):
        if norm(s[i], s[i - 1]) == 1:
            loops_tops.append(i)
    if len(loops_tops) == 0:
        return False    for start in loops_tops:
        step_right = 0        step_left = 1        move = True        while move:
            right = int(s[start + step_right])
            left = int(s[start - step_left])
            if (right == 9) or (left == 9):
                move = False            elif norm(left, right) != 1:
                if (start + step_right + 1) < len_s:
                    if norm(int(s[start + step_right + 1]), left) == 1:
                        s1 = s.copy()
                        del s1[start + step_right]
                        if check_perfect(s1):
                            return True                if (start - step_left - 1) > -1:
                    if norm(right, int(s[start - step_left - 1])) == 1:
                        s1 = s.copy()
                        del s1[start - step_left]
                        if check_perfect(s1):
                            return True                move = False            else:
                s[start + step_right] = '9'                s[start - step_left] = '9'            if ((start + step_right + 1) >= len_s) or ((start - step_left - 1) < 0):
                move = False            step_right += 1            step_left += 1    new_s = [x for x in s if x != '9']
    len_new_s = len(new_s)
    if len_new_s == len_s:
        return False    else:
        return check_perfect_if_one_hole(new_s)



s0 = sys.stdin.readline()
s0 = s0.replace('\n', '')
s1 = s0.replace('U', '2')
s2 = s1.replace('A', '1')
s3 = s2.replace('G', '5')
s4 = s3.replace('C', '6')
s = list(s4)
lens = len(s)
if lens % 2 == 0:
    if check_perfect(s):
        print('perfect')
    else:
        print('imperfect')
else:
    if check_perfect_if_one_hole(s):
        print('almost perfect')
    else:
        print('imperfect')

Hint:
One could notice that the previous Dynamic Programming solution resembles the solutions for problems with bracket sequences. We could see a perfect string as a correct bracket sequence and we check the correctness of bracket sequences using stack!
So, to check if the string is perfect we iterate through bases and maintain a stack:
  1. If the current base is complementary to the base on top of the stack, then we pop from the stack.
  2. Otherwise, we put the current base into the stack.
If there is no base in the stack, then the given string is perfect.Now, how to understand that the string is almost perfect? The simplest solution will be to delete each symbol and check the resulting string to be perfect. Unfortunately, such solution works in O(|S|2)
time and does not fit into timelimit.
But the way to deal with it is quite easy. Let us perform the solution as in the case of the perfect string. We state that if the string is almost perfect then the base to delete is the middle base of the stack. We remove this base and check that the stack collapses. (In other words, we check that the stack has odd length and it holds a palindrome)

So, suitable solution is:

import sys

def complementary(a, b):
    if (a == 'A' and b == 'U') or (a == 'U' and b == 'A') or (a == 'G' and b == 'C') or (a == 'C' and b == 'G'):
        return True    else:
        return False
def go_stack(str):
    stack = []
    for i in str:
        if len(stack) > 0:
            if complementary(stack[-1], i):
                stack.pop()
            else:
                stack.append(i)
        else:
            stack.append(i)
    return stack


s = sys.stdin.readline().strip()
stack = go_stack(s)
len_stack = len(stack)
if len_stack == 0:
    print('perfect')
else:
    if len_stack % 2 == 0:
        print('imperfect')
    else:
        stack.pop(int((len_stack - 1)/2))
        stack2 = go_stack(stack)
        if len(stack2) == 0:
            print('almost perfect')
        else:
            print('imperfect')

воскресенье, 12 февраля 2017 г.

Naive implementation of a de-Bruijn graph

Думала я как собрать риды в контиг (для задачки). Помню, на лекциях все так восторженно рассказывали про граф Де Брюйна. Но почему так делать - это оптимально не объяснили. Ну и ладно, нашла я код под названием "naive implementation of a de-Bruijn graph". Ключевой момент там в ф-ии  get_contig_forward, она очень упрощена, не рассматривается случай раздвоения вариантов (когда надо выбрать более длинный контиг), не рассматривается случай повтороу участка кода длиннее k (да и как их обработать). Но сама обертка есть...

import collections
 
def twin(km):
    return Seq.reverse_complement(km)
 
def kmers(seq,k):
    for i in xrange(len(seq)-k+1):
        yield seq[i:i+k]
 
def fw(km):
    for x in 'ACGT':
        yield km[1:]+x
 
def bw(km):
    for x in 'ACGT':
        yield x + km[:-1]
 
def build(fn,k=31,limit=1):
   
   d = collections.defaultdict(int)

   with open('tests/1', 'r') as f:
        main_string = f.readline()
        for line in f.readlines()[2:]:
            line = line.strip()
                 for km in kmers(seq,k):
                    d[km] +=1
                 seq = twin(seq)
                 for km in kmers(seq,k):
                    d[km] += 1
 
    d1 = [x for x in d if d[x] <= limit]
    for x in d1:
        del d[x]
 
    return d
 
def get_contig_forward(d,km):
    c_fw = [km]
     
    while True:
        if sum(x in d for x in fw(c_fw[-1])) != 1:
            break
         
        cand = [x for x in fw(c_fw[-1]) if x in d][0]
        if cand in c_fw:
            break
         
        if sum(x in d for x in bw(cand)) != 1:
            break
 
        c_fw.append(cand)
 
    return c_fw
 
def get_contig(d,km):
    c_fw = get_contig_forward(d,km)
    c_bw = get_contig_forward(d,twin(km))
 
    c = [twin(x) for x in c_bw[-1:0:-1]] + c_fw
    s = c[0] + ''.join(x[-1] for x in c[1:])
    return s,c
 
def all_contigs(d):
    done = set()
    r = []
    for x in d:
        if x not in done:
            s,c = get_contig(d,x)
            for y in c:
                done.add(y)
                done.add(twin(y))
            r.append(s)
    return r
 
 
 
 
 
 

вторник, 17 января 2017 г.

Calculating Protein Mass (ROSALIND PRTM)

Given: A protein string P of length at most 1000 aa.

Return: The total weight of P. Consult the monoisotopic mass table.

def mass_table():
    mt = {}
    mt['A'] = 71.03711 
    mt['C'] = 103.00919 
    mt['D'] = 115.02694
    mt['E'] = 129.04259
    mt['F'] = 147.06841 
    mt['G'] = 57.02146 
    mt['H'] = 137.05891 
    mt['I'] = 113.08406 
    mt['K'] = 128.09496 
    mt['L'] = 113.08406 
    mt['M'] = 131.04049
    mt['N'] = 114.04293 
    mt['P'] = 97.05276 
    mt['Q'] = 128.05858 
    mt['R'] = 156.10111 
    mt['S'] = 87.03203 
    mt['T'] = 101.04768 
    mt['V'] = 99.06841 
    mt['W'] = 186.07931
    mt['Y'] = 163.06333 
    return mt

protein = input()
mt = mass_table()
mass = 0 
for aa in protein:
    mass += mt[aa]
print(round(mass,3))