-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #84 from rmcc3/main
Add support for DeepSeek language models in STORM Wiki pipeline
- Loading branch information
Showing
2 changed files
with
246 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
""" | ||
STORM Wiki pipeline powered by DeepSeek models and You.com or Bing search engine. | ||
You need to set up the following environment variables to run this script: | ||
- DEEPSEEK_API_KEY: DeepSeek API key | ||
- DEEPSEEK_API_BASE: DeepSeek API base URL (default is https://api.deepseek.com) | ||
- YDC_API_KEY: You.com API key; or, BING_SEARCH_API_KEY: Bing Search API key | ||
Output will be structured as below:
args.output_dir/ | ||
topic_name/ # topic_name will follow convention of underscore-connected topic name w/o space and slash | ||
conversation_log.json # Log of information-seeking conversation | ||
raw_search_results.json # Raw search results from search engine | ||
direct_gen_outline.txt # Outline directly generated with LLM's parametric knowledge | ||
storm_gen_outline.txt # Outline refined with collected information | ||
url_to_info.json # Sources that are used in the final article | ||
storm_gen_article.txt # Final article generated | ||
storm_gen_article_polished.txt # Polished final article (if args.do_polish_article is True) | ||
""" | ||
|
||
import os | ||
import sys | ||
import re | ||
import logging | ||
from argparse import ArgumentParser | ||
|
||
from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs | ||
from knowledge_storm.lm import DeepSeekModel | ||
from knowledge_storm.rm import YouRM, BingSearch | ||
from knowledge_storm.utils import load_api_key | ||
|
||
|
||
def sanitize_topic(topic):
    """
    Return *topic* transformed into a filesystem-safe name.

    Spaces become underscores, every remaining character outside
    ``[a-zA-Z0-9_-]`` is stripped, and if nothing survives the
    fallback name ``"unnamed_topic"`` is returned.
    """
    # Underscore the spaces first, then drop every disallowed character.
    safe_name = re.sub(r'[^a-zA-Z0-9_-]', '', topic.replace(' ', '_'))

    # Guard against a fully-stripped (empty) result.
    return safe_name or "unnamed_topic"
|
||
|
||
def main(args):
    """
    Run the STORM Wiki pipeline with DeepSeek language models.

    Loads API keys from ``secrets.toml``, configures one DeepSeek LM per
    pipeline role, selects the retrieval module (You.com or Bing), prompts
    the user for a topic on stdin, and executes the requested pipeline
    stages.

    Args:
        args: Parsed command-line arguments (see the ArgumentParser in the
            ``__main__`` block) controlling model choice, sampling
            parameters, retriever, output directory, and pipeline stages.

    Raises:
        ValueError: If DEEPSEEK_API_KEY is unset or the retriever name is
            not 'bing' or 'you'.
        Exception: Any error raised by the pipeline run is logged with a
            traceback and re-raised.
    """
    load_api_key(toml_file_path='secrets.toml')
    lm_configs = STORMWikiLMConfigs()
    # Fix: `logger` was referenced in the except block below but never
    # defined anywhere in the module, turning any pipeline failure into a
    # NameError that masked the original exception.
    logger = logging.getLogger(__name__)

    # Ensure DEEPSEEK_API_KEY is set
    if not os.getenv("DEEPSEEK_API_KEY"):
        raise ValueError("DEEPSEEK_API_KEY environment variable is not set. Please set it in your secrets.toml file.")

    deepseek_kwargs = {
        'api_key': os.getenv("DEEPSEEK_API_KEY"),
        'api_base': os.getenv("DEEPSEEK_API_BASE", "https://api.deepseek.com"),
        'temperature': args.temperature,
        'top_p': args.top_p,
    }

    # DeepSeek offers two main models: 'deepseek-chat' for general tasks and 'deepseek-coder' for coding tasks.
    # All five pipeline roles share the same model; only max_tokens differs,
    # sized to the expected output length of each stage.
    conv_simulator_lm = DeepSeekModel(model=args.model, max_tokens=500, **deepseek_kwargs)
    question_asker_lm = DeepSeekModel(model=args.model, max_tokens=500, **deepseek_kwargs)
    outline_gen_lm = DeepSeekModel(model=args.model, max_tokens=400, **deepseek_kwargs)
    article_gen_lm = DeepSeekModel(model=args.model, max_tokens=700, **deepseek_kwargs)
    article_polish_lm = DeepSeekModel(model=args.model, max_tokens=4000, **deepseek_kwargs)

    lm_configs.set_conv_simulator_lm(conv_simulator_lm)
    lm_configs.set_question_asker_lm(question_asker_lm)
    lm_configs.set_outline_gen_lm(outline_gen_lm)
    lm_configs.set_article_gen_lm(article_gen_lm)
    lm_configs.set_article_polish_lm(article_polish_lm)

    engine_args = STORMWikiRunnerArguments(
        output_dir=args.output_dir,
        max_conv_turn=args.max_conv_turn,
        max_perspective=args.max_perspective,
        search_top_k=args.search_top_k,
        max_thread_num=args.max_thread_num,
    )

    # STORM is a knowledge curation system which consumes information from the retrieval module.
    # Currently, the information source is the Internet and we use search engine API as the retrieval module.
    if args.retriever == 'bing':
        rm = BingSearch(bing_search_api=os.getenv('BING_SEARCH_API_KEY'), k=engine_args.search_top_k)
    elif args.retriever == 'you':
        rm = YouRM(ydc_api_key=os.getenv('YDC_API_KEY'), k=engine_args.search_top_k)
    else:
        raise ValueError(f"Invalid retriever: {args.retriever}. Choose either 'bing' or 'you'.")

    runner = STORMWikiRunner(engine_args, lm_configs, rm)

    topic = input('Topic: ')
    # Sanitize so the topic can safely be used as a directory/file name.
    sanitized_topic = sanitize_topic(topic)

    try:
        runner.run(
            topic=sanitized_topic,
            do_research=args.do_research,
            do_generate_outline=args.do_generate_outline,
            do_generate_article=args.do_generate_article,
            do_polish_article=args.do_polish_article,
            remove_duplicate=args.remove_duplicate,
        )
        runner.post_run()
        runner.summary()
    except Exception as e:
        # logger.exception records the full traceback; re-raise so the
        # process still exits with a failure.
        logger.exception(f"An error occurred: {str(e)}")
        raise
|
||
|
||
if __name__ == '__main__':
    # Command-line interface for the DeepSeek-powered STORM Wiki pipeline.
    arg_parser = ArgumentParser()

    # --- global arguments ---
    arg_parser.add_argument('--output-dir', type=str, default='./results/deepseek',
                            help='Directory to store the outputs.')
    arg_parser.add_argument('--max-thread-num', type=int, default=3,
                            help='Maximum number of threads to use. The information seeking part and the article generation'
                            'part can speed up by using multiple threads. Consider reducing it if keep getting '
                            '"Exceed rate limit" error when calling LM API.')
    arg_parser.add_argument('--retriever', type=str, choices=['bing', 'you'], required=True,
                            help='The search engine API to use for retrieving information.')
    arg_parser.add_argument('--model', type=str, choices=['deepseek-chat', 'deepseek-coder'], default='deepseek-chat',
                            help='DeepSeek model to use. "deepseek-chat" for general tasks, "deepseek-coder" for coding tasks.')
    arg_parser.add_argument('--temperature', type=float, default=1.0,
                            help='Sampling temperature to use.')
    arg_parser.add_argument('--top_p', type=float, default=0.9,
                            help='Top-p sampling parameter.')

    # --- pipeline stage toggles ---
    arg_parser.add_argument('--do-research', action='store_true',
                            help='If True, simulate conversation to research the topic; otherwise, load the results.')
    arg_parser.add_argument('--do-generate-outline', action='store_true',
                            help='If True, generate an outline for the topic; otherwise, load the results.')
    arg_parser.add_argument('--do-generate-article', action='store_true',
                            help='If True, generate an article for the topic; otherwise, load the results.')
    arg_parser.add_argument('--do-polish-article', action='store_true',
                            help='If True, polish the article by adding a summarization section and (optionally) removing '
                            'duplicate content.')

    # --- pre-writing stage hyperparameters ---
    arg_parser.add_argument('--max-conv-turn', type=int, default=3,
                            help='Maximum number of questions in conversational question asking.')
    arg_parser.add_argument('--max-perspective', type=int, default=3,
                            help='Maximum number of perspectives to consider in perspective-guided question asking.')
    arg_parser.add_argument('--search-top-k', type=int, default=3,
                            help='Top k search results to consider for each search query.')

    # --- writing stage hyperparameters ---
    arg_parser.add_argument('--retrieve-top-k', type=int, default=3,
                            help='Top k collected references for each section title.')
    arg_parser.add_argument('--remove-duplicate', action='store_true',
                            help='If True, remove duplicate content from the article.')

    main(arg_parser.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters