当前位置：首页 > news >正文

工作记录—DUYAO-JIEYAO系统进化与单倍型分析

news 2025/7/6 12:44:32

1.数据搜集

代码：基于爬虫批量下载

我的尊严不允许我手点一个个下载，爬虫！

import requestsdef download_file(url):# 从URL中获取文件名filename = url.split('/')[-1]# 下载文件response = requests.get(url)# 保存文件with open(f"../data/{filename}", 'wb') as f:f.write(response.content)print(f"Downloaded: {filename}")def download1(page):cookies = {
#}headers = {
#}params = {'pageNumber': f'{page}','pageSize': '20','speciesName': '','_': '1732209378705',}response = requests.get('http://www.ricesuperpir.com/web/download/search',params=params,cookies=cookies,headers=headers,verify=False,)dic = response.json()['rows'][:]print(dic)for i in dic:Annotation = "http://www.ricesuperpir.com" + i['geneAnnotationUrl']genome = "http://www.ricesuperpir.com" + i['genomeSequenceUrl']print("正在下载：", Annotation, genome)download_file(Annotation)download_file(genome)for page in range(1, 14):download1(page)

拿到数据之后：
发现数据库里序列都是FASTA、里面还有一些ctg这种的，没办法从注释里面直接提取序列。
打算用DK24、TGR4这两个标记做ePCR去所有的FASTA里提取。

显示简单用CW01做了一下blast测试

代码：简单BLAST

如果只是做blast来测试看看标记情况，用下面这段代码：会建库之后进行blast，并保存结果

import os
import subprocessimport pandas as pd# 指定BLAST安装路径和执行命令
blast_path = r"E:\softWare\blast-2.16.0+\bin"
blastn_exe = os.path.join(blast_path, "blastn.exe")
makeblastdb_exe = os.path.join(blast_path, "makeblastdb.exe")def make_blast_db(input_fasta, dbtype="nucl"):"""创建BLAST数据库Args:input_fasta: 输入的FASTA文件路径dbtype: 数据库类型,默认为nucl(核酸)"""makeblastdb_command = [makeblastdb_exe, "-in", input_fasta, "-dbtype", dbtype, "-parse_seqids"]subprocess.run(makeblastdb_command)def run_blast_search(query_file, database, output_file):"""执行BLAST搜索Args:query_file: 查询序列文件database: BLAST数据库文件output_file: 输出文件路径"""blast_command = [blastn_exe, "-query", query_file, "-db", database, "-out", output_file, "-outfmt", "10", "-evalue","1", "-word_size", "20", "-task", "blastn-short"]return subprocess.run(blast_command)def process_blast_results(blast_output, genome_file):"""处理BLAST结果并提取对应序列"""results = {}# 读取BLAST结果blastResult = pd.read_csv(blast_output, header=None)# 设置列名blastResult.columns = ['marker', 'chrom', 'identity', 'length', 'mismatches', 'gaps', 'q_start', 'q_end', 's_start','s_end', 'evalue', 'bitscore']"""#这里是一个奇怪的功能  如果一个标记匹配到多条染色体 就筛选使用匹配次数最多的染色体的结果# 找出最常见的染色体编号most_common = blastResult['chrom'].mode()[0]  # 修改这里：使用列名而不是索引print(f"Most common chromosome: {most_common}")# 过滤数据blastResult = blastResult[blastResult['chrom'] == most_common]"""blastResult = blastResultreturn blastResultif __name__ == "__main__":query_file = r"marker_fasta.fa"genome_file = r"dataForTest/CW01.cr.fa"output_file = "blast_results.csv"# 1. 建立BLAST数据库make_blast_db(genome_file)# 2. 运行BLAST搜索blast_result = run_blast_search(query_file, genome_file, output_file)print("BLAST search completed with status:", blast_result)# 3. 处理结果并提取序列try:results = process_blast_results(output_file, genome_file)results.to_csv(output_file, index=False)except Exception as e:print(f"Error processing results: {e}")

代码：基因序列提取（完善度稍好一些）

同时最终的目的是拿到目的片段，所以这段代码在理想情况下（引物匹配到且专一性较强）可以获取指定自交系理论上的RHS12核酸序列，同时也保留blast结果

2024年11月24日15:08:20

但是后期做单倍型至少要用CDS （蛋白）

这里虽然硬把基因序列提出来了但是没啥用

import csv
import os
import subprocess
from typing import Dictimport pyfaidx
from Bio.Seq import Seqdef make_blast_db(input_fasta: str, blast_path: str = r"E:\softWare\blast-2.16.0+\bin"):"""创建BLAST数据库Args:input_fasta: 输入的FASTA文件路径blast_path: BLAST程序所在目录"""makeblastdb_exe = os.path.join(blast_path, "makeblastdb.exe")cmd = [makeblastdb_exe, "-in", input_fasta, "-dbtype", "nucl", "-parse_seqids"]result = subprocess.run(cmd, capture_output=True, text=True)if result.returncode != 0:raise Exception(f"makeblastdb失败: {result.stderr}")def run_blast_search(query_file: str, database: str, output_file: str,blast_path: str = r"E:\softWare\blast-2.16.0+\bin") -> None:"""运行BLAST搜索Args:query_file: 引物序列文件database: 基因组数据库文件output_file: 输出文件路径blast_path: BLAST程序所在目录"""blastn_exe = os.path.join(blast_path, "blastn.exe")cmd = [blastn_exe, "-query", query_file, "-db", database, "-out", output_file, "-outfmt", "10",  # CSV格式"-evalue", "1", "-word_size", "20", "-task", "blastn-short"  # 适用于短序列比对]result = subprocess.run(cmd, capture_output=True, text=True)if result.returncode != 0:raise Exception(f"BLAST搜索失败: {result.stderr}")class FastaExtractor:def __init__(self, fasta_file: str):"""初始化序列提取器"""self.fasta_index = pyfaidx.Fasta(fasta_file)def extract_sequence(self, chrom: str, start: int, end: int, reverse_complement=False) -> str:"""提取指定区域的序列Args:chrom: 染色体名start: 起始位置end: 终止位置reverse_complement: 是否需要反向互补"""# 确保start小于endextract_start = min(start, end)extract_end = max(start, end)sequence = str(self.fasta_index[chrom][extract_start - 1:extract_end])# 如果原始的start大于end，或者明确要求反向互补，则进行反向互补if reverse_complement or start > end:sequence = str(Seq(sequence).reverse_complement())return sequencedef simulate_epcr(genome_file: str, blast_result_file: str, max_product_size: int = 100000) -> Dict:"""模拟ePCR过程"""# 解析BLAST结果forward_pos, reverse_pos = None, Nonewith open(blast_result_file, 'r') as f:reader = csv.reader(f)for row in reader:if len(row) != 12:continuemarker, chrom, identity = row[0:3]s_start, s_end = int(row[8]), int(row[9])if marker.endswith('_F'):forward_pos = (chrom, s_start, s_end)elif marker.endswith('_R'):reverse_pos = (chrom, s_start, s_end)if not (forward_pos and reverse_pos):print("错误: 未找到完整的引物对")return None# 验证引物位置if forward_pos[0] != reverse_pos[0]:print("错误: 引物不在同一条染色体上")return None# 获取产物范围chrom = forward_pos[0]product_start = min(forward_pos[1], forward_pos[2], reverse_pos[1], reverse_pos[2])product_end = max(forward_pos[1], forward_pos[2], reverse_pos[1], reverse_pos[2])product_size = product_end - product_start + 1if product_size > max_product_size:print(f"警告: 产物大小({product_size}bp)超过最大限制({max_product_size}bp)")return None# 提取序列try:extractor = FastaExtractor(genome_file)# 提取正向引物序列forward_seq = extractor.extract_sequence(chrom, forward_pos[1], forward_pos[2], reverse_complement=False# BLAST已经给出了正确的方向)# 提取反向引物序列reverse_seq = extractor.extract_sequence(chrom, reverse_pos[1], reverse_pos[2], reverse_complement=False# BLAST已经给出了正确的方向)# 提取完整产物序列product_seq = extractor.extract_sequence(chrom, product_start, product_end)return {'chromosome': chrom, 'product_start': product_start, 'product_end': product_end,'product_size': product_size,'forward_primer': {'start': forward_pos[1], 'end': forward_pos[2], 'sequence': forward_seq,'is_reverse': forward_pos[1] > forward_pos[2]},'reverse_primer': {'start': reverse_pos[1], 'end': reverse_pos[2], 'sequence': reverse_seq,'is_reverse': reverse_pos[1] > reverse_pos[2]}, 'product_sequence': product_seq}except Exception as e:print(f"提取序列时出错: {e}")return Nonedef run_epcr_pipeline(primer_file: str, genome_file: str, blast_path: str = r"E:\softWare\blast-2.16.0+\bin"):"""运行完整的ePCR流程Args:primer_file: 引物序列文件genome_file: 基因组文件blast_path: BLAST程序路径"""blast_result_file = "blast_results.csv"try:# 1. 创建BLAST数据库print("正在创建BLAST数据库...")make_blast_db(genome_file, blast_path)# 2. 运行BLAST搜索print("正在运行BLAST搜索...")run_blast_search(primer_file, genome_file, blast_result_file, blast_path)# 3. 进行ePCR模拟print("正在模拟ePCR...")result = simulate_epcr(genome_file, blast_result_file)if result:print("\nePCR 结果:")print(f"位置: {result['chromosome']}:{result['product_start']}-{result['product_end']}")print(f"产物大小: {result['product_size']} bp")print("\n正向引物:")print(f"位置: {result['forward_primer']['start']}-{result['forward_primer']['end']}")print(f"序列: {result['forward_primer']['sequence']}")print(f"是否反向: {result['forward_primer']['is_reverse']}")print("\n反向引物:")print(f"位置: {result['reverse_primer']['start']}-{result['reverse_primer']['end']}")print(f"序列: {result['reverse_primer']['sequence']}")print(f"是否反向: {result['reverse_primer']['is_reverse']}")print("\n产物序列:")print(result['product_sequence'])return resultexcept Exception as e:print(f"运行过程中出错: {e}")return Noneif __name__ == "__main__":primer_file = "marker.fasta"  # 包含正向和反向引物的FASTA文件genome_file = "genomes\\CW01.cr.fa"result = run_epcr_pipeline(primer_file, genome_file)

2024年11月24日 —— 问题

从标记的情况来看，是有可能可以得到大部分DK24、TGR4中间的核酸序列的物理位置的，但是最后做单倍型分析等等肯定需要蛋白序列或者CDS

往往上述位置中都有多个基因

可以通过gff把这些注释提取出来，应该可以获得多个CDS的组合

按照位置截取并提取CDS会导致最终蛋白出现问题（不是3的倍数没有终止子）

所以就重新调整取开区间（只要标记位置和一个基因内的CDS区间有重叠就取这个基因的全部CDS）

由于TBtools中的功能没有办法批量做所以这段只能自己复现然后和TBtools里面比对

代码：完整CDS区域的提取

最后修改时间 2024年11月27日01:21:26

今天随着运行更多文件，出现了更多bug，

1.gff里的文件里提取cds不能按照默认顺序提取，需要按物理位置重新排序一下，不然拼接出的一个区间内的cds序列就是乱的

2.要注意序列 - +
if strand == '-':cds_seq = cds_seq.reverse_complement()
3.解决了取开区间过程中造成的错误，否则会直接导致蛋白质翻译全部出错
4.白天本来连怎么分析蛋白都尝试很多了，结果发现两个标记在251 (202 O. sativa, 28 O. rufipogon, 11 O. glaberrima, and 10 O. barthii)和33个栽培品种中匹配度都很好，但是在野生型中几乎匹配不到（卡住了）

import gzip
import os
import subprocessimport pandas as pd
from Bio import SeqIOunMatched = []class GetPosition:"""用于通过BLAST搜索获取序列位置信息的类"""def __init__(self, blast_path: str = r"E:\softWare\blast-2.16.0+\bin"):self.output_cds = Noneself.blast_path = blast_pathself.blastn_exe = os.path.join(blast_path, "blastn.exe")self.makeblastdb_exe = os.path.join(blast_path, "makeblastdb.exe")self.genome_file = Noneself.output_file = Noneself.gff_file = Noneself.blastResultFold = "blastResult"def set_files(self, genome_file: str, gff_file: str, output_file: str, output_cds) -> None:"""设置输入输出文件路径"""self.genome_file = genome_fileself.output_file = output_fileself.gff_file = gff_fileself.output_cds = output_cdsdef make_blast_db(self, dbtype: str = "nucl") -> None:"""创建BLAST数据库"""if self.genome_file.endswith('.gz'):with gzip.open(self.genome_file, 'rb') as f_in:with open(self.genome_file[:-3], 'wb') as f_out:f_out.write(f_in.read())self.genome_file = self.genome_file[:-3]makeblastdb_command = [self.makeblastdb_exe, "-in", self.genome_file, "-dbtype", dbtype, "-parse_seqids"]subprocess.run(makeblastdb_command, check=True)def run_blast_search(self) -> None:"""执行BLAST搜索"""blast_command = [self.blastn_exe, "-query", "marker_fasta.fa", "-db", self.genome_file, "-out",self.output_file,"-outfmt", "10", "-evalue", "2", "-word_size", "20", "-task", "blastn-short"]subprocess.run(blast_command, check=True)def process_blast_results(self):"""处理BLAST结果并提取位置信息"""blast_result = pd.read_csv(self.output_file, header=None)blast_result.columns = ['marker', 'chrom', 'identity', 'length', 'mismatches', 'gaps', 'q_start', 'q_end','s_start', 's_end', 'evalue', 'bitscore']if blast_result.shape[0] == 4:chrom = blast_result['chrom'].mode()[0]# 合并所有start和end坐标并排序all_positions = sorted(blast_result['s_start'].tolist() + blast_result['s_end'].tolist())# 取第4大作为start,第5大作为endpositions_start = all_positions[3]positions_end = all_positions[4]return chrom, positions_start, positions_endelse:unMatched.append(self.genome_file)print(self.genome_file)return Nonedef get_cds_sequences(self, chrom, start, end):if self.gff_file.endswith('.gz'):with gzip.open(self.gff_file, 'rb') as f_in:with open(self.gff_file[:-3], 'wb') as f_out:f_out.write(f_in.read())self.gff_file = self.gff_file[:-3]# 读取和过滤GFF数据gff_data = pd.read_csv(self.gff_file, comment='#', sep='\t', header=None)cds_records = gff_data[(gff_data.iloc[:, 0] == chrom) &(gff_data.iloc[:, 2] == 'CDS') &(gff_data.iloc[:, 3] >= start) &(gff_data.iloc[:, 4] <= end)&gff_data.iloc[:, 8].str.contains('T01')#cultivated]# 获取并过滤attributesattributes = {i.split(';Parent=')[-1] for i in cds_records.iloc[:, 8]}cds_records = pd.concat([gff_data[(gff_data.iloc[:, 8].str.contains(i)) &(gff_data.iloc[:, 0] == chrom) &(gff_data.iloc[:, 2] == 'CDS')]for i in attributes])# cds_records = gff_data[gff_data.iloc[:, 8].isin(attributes)]#251# cds_records = gff_data[gff_data.iloc[:, 8].str.contains('T01') & gff_data.iloc[:, 8].isin(attributes)]#wildcds_records = cds_records.sort_values(by=3)print(cds_records)# 读取基因组序列genome_seq = Nonefor record in SeqIO.parse(self.genome_file, "fasta"):if record.id == chrom:genome_seq = record.seqbreakif genome_seq is None:exit(911)# 提取并拼接CDS序列cds_sequences = []for _, cds in cds_records.iterrows():cds_start = cds.iloc[3] - 1  # -1 ?cds_end = cds.iloc[4]strand = cds.iloc[6]cds_seq = genome_seq[cds_start:cds_end].strip()if strand == '-':cds_seq = cds_seq.reverse_complement()# len_1 = len(cds_seq)cds_sequences.append(str(cds_seq))return ''.join(cds_sequences)def run_complete_analysis(self):"""执行完整的分析流程"""try:self.make_blast_db()self.run_blast_search()results = self.process_blast_results()chrom, start, end = resultscds_sequence = self.get_cds_sequences(chrom, start, end)self.output_cds[0].write(">" + self.output_cds[1] + "\n" + cds_sequence + "\n")print(cds_sequence)return resultsexcept Exception as e:print(f"分析过程中发生错误: {e}")return Nonedef main():"""主函数示例"""finder = GetPosition()rices = set()output_cds = open("CDSresult/cdsFasta.fa", "a")# rice_mold = "data_cultivated"rice_mold = "data_cultivated"# fa_gff = ['.cr.fa.gz', '.gff3.gz']# fa_gff = ['.fasta', '.fasta.gff3']fa_gff = ['.genome', '.IGDBv1.Allset.gff']for i in os.listdir(f"{rice_mold}"):rices.add(i.split(".")[0])for i in sorted(rices):finder.set_files(genome_file=rf"{rice_mold}/{i}{fa_gff[0]}",gff_file=rf"{rice_mold}/{i}{fa_gff[1]}",output_file=rf"blastResult/{rice_mold}_{i}_blast_results.csv",output_cds=[output_cds, i])results = finder.run_complete_analysis()# finder.cleanup_temp_files()if results:chrom, start, end = resultsprint(f"分析完成! 标记位置信息: chrom={chrom},start={start}, end={end}")else:print("分析未能获取有效结果")output_cds.close()if __name__ == "__main__":main()print(unMatched)

终于能获得正常的蛋白了测试用的CW001没有提前终止蛋白末尾刚好有一个终止子

氨基酸数量也对

补充：单个材料基于GFF/gff3的CDS提取

直接在tbtools里做就很方便了，可以很方便的提取出所有基因的CDS，拼接

代码 CDS-protein的转化

# 使用Python的Biopython包
from Bio.Seq import SeqcdsFile = open("cdsFasta.fa", "r")
proteinFile = open("proteinFasta.fa", "a")for line in cdsFile.readlines():if ">" in line:print(line)proteinFile.write(line)else:proteinFile.write(str(Seq(line.strip()).translate()) + "\n")

tbtools也很容易做，这个可以批量做，主要是用来对比了一下自己的代码结果是否正确

2024年11月27日01:49:15

解决了位置问题之后通过motif大概看出来区间是对了的

现在一个是要解决之后的单倍型怎么做

一个是正常手段的标记卡不出来的自交系怎么解决、

motif memesuite做的啥用没有不知道这个蓝的到处跳是不是我的代码又写错了还是真的变异

单倍型分析

1.对齐

mafft --auto proteinFasta_2.fa > mafft_1.fasta

2.maga里比对

3.Compute Amino Acid Composition"，这将显示各个序列中氨基酸的组成比例和分布情况。这有助于初步了解序列的基本特征和可能的差异。

4.模型选择

7950X很久没这样动过了挺暖手的

模型比较解读： WAG+G+F模型显示出最佳拟合效果，这一结论基于以下观察：

该模型具有最低的BIC值（66160.482）和AICc值（65301.827）
虽然WAG+G+I+F模型参数略多（94个vs 93个），但其BIC和AICc值略高，表明额外参数并未带来显著改善

拟合模型说明：

WAG代表Whelan And Goldman模型，是专门为蛋白质序列设计的替换模型
+G表示考虑了位点间替换率的gamma分布
+F代表使用观察到的氨基酸频率
+I代表考虑了不变位点的比例

5.使用最优模型进行系统发育分析tree和tree文件

6.TBtools结果

根本上还是序列太长了，怎么获取有效序列是最大的问题

2024年11月29日00:24:20

全面的测试了每个文章中标记的可用性情况，在野生型、育成品种中提取CDS进行protein预测

即使在RHS12中可以提取到大部分的位点，获得的也是一个长度大概2000~3000的蛋白序列，会得到一个非常冗长的蛋白序列，难以识别有效的单倍型。

——后续转向尝试更小的阅读框来分析单倍型

qHMS7、S5、RHS12提供的标记在33个栽培品种和地方品种中表现好，可以通过blast获取有效区间，从而提取CDS。

S1提供的标记效果非常不好，在大部分品种中都不匹配，可能与引物过长有关。

因此，当务之急还是先解决序列的问题，

对于不完全匹配的标记，可以采取降低e-value然后通过一些简单逻辑来得到正确区间。

对于完全匹配不到的标记：

1.考虑拿到文章定位所使用的fa，定位到正确序列重新设计多对引物，用于解决多个基因组内找不到共同保守序列的问题

2.由于qHMS7、S5、RHS12都有提供蛋白相关的序列与各单倍型序列、且蛋白质序列通常比核苷酸序列更保守，可能会尝试用blastP来进行匹配或者直接从在数据库蛋白文件中找多种单倍型。（需要把所有数据库全部做CDS的提取、全部转化为蛋白序列）