import { IConversation } from "app/types"
import { TOKEN_ESTIMATE_FACTOR } from "../constants"
import { decode, heEscape } from "utils/sanitize"
import { formatSampledConversations } from "./llmUtils/llmFormatting"

/**
 * Formatting mode for `rawDataFormatter`.
 *
 * - `"full"`: each conversation is used in its entirety (no per-conversation slicing).
 * - `"truncated"`: each conversation is sliced to the per-conversation budget, and
 *   the conversation that crosses the total token limit is still appended before
 *   processing stops.
 *
 * When the mode is omitted, conversations are sliced like `"truncated"`, but the
 * conversation that crosses the total token limit is not appended.
 */
type Mode = "full" | "truncated"

/** Input for `rawDataFormatter`. */
export interface RawDataFormatterParams {
  /** Conversations to format, processed in array order. */
  conversations: IConversation[]
  /**
   * Per-conversation token budget. Outside `"full"` mode each formatted
   * conversation is cut to at most
   * `floor(tokensPerConversation / TOKEN_ESTIMATE_FACTOR)` characters.
   */
  tokensPerConversation: number
  /**
   * Total estimated-token limit for the combined output. A falsy value
   * (e.g. `0`) disables the limit check.
   */
  tokenLimit: number
  /** Formatting mode; see `Mode`. Optional. */
  mode?: Mode
}

/** Result of `rawDataFormatter`. */
interface RawDataFormatterReturns {
  /** Concatenation of the `<conversation id="…">…</conversation>` entries that were emitted. */
  formattedRawData: string
  /**
   * Number of conversations emitted, i.e. index of the last appended
   * conversation plus one (`0` when nothing was appended).
   */
  formattedRawDataLength: number
  /** `floor(formattedRawData.length * TOKEN_ESTIMATE_FACTOR)`. */
  formattedRawDataTokenCount: number
  /**
   * Largest floored token estimate of any single formatted conversation
   * (measured before slicing) that was examined before processing stopped.
   */
  formattedRawDataTokenCountLargestItem: number
}

/**
 * Cuts `text` to at most `maxChars` UTF-16 code units, preferring to end just
 * before a space so the last word is not chopped in half.
 *
 * - Text that already fits is returned unchanged.
 * - A non-positive (or NaN) budget yields an empty string.
 * - If no space is found after the first character, the text is hard-cut at
 *   `maxChars`, backing off by one unit when that would split a surrogate pair.
 */
const truncateAtWordBoundary = (text: string, maxChars: number): string => {
  if (!(maxChars > 0)) return ""
  if (text.length <= maxChars) return text

  // text[cut] is the first character that would be dropped; when it is a
  // space, the kept prefix ends exactly on a word boundary.
  let cut = maxChars
  while (cut > 0 && text[cut] !== " ") {
    cut--
  }

  if (cut === 0) {
    // No usable space: fall back to a hard cut at the budget.
    cut = maxChars
    const lastKept = text.charCodeAt(cut - 1)
    // Do not leave a lone high surrogate at the end of the slice.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut--
  }

  return text.slice(0, cut)
}

/**
 * Formats conversations into a single string of
 * `<conversation id="i">…</conversation>` entries while tracking estimated
 * token usage (`characters * TOKEN_ESTIMATE_FACTOR`).
 *
 * Each conversation is rendered with `formatSampledConversations`. In `"full"`
 * mode the rendered text is used as is; otherwise it is truncated to the
 * per-conversation budget at a word boundary. Processing stops at the first
 * conversation whose addition would push the estimate above
 * `params.tokenLimit`; that conversation is still emitted only in
 * `"truncated"` mode.
 *
 * @param params - Conversations, budgets and mode; see `RawDataFormatterParams`.
 * @returns The combined string, the number of conversations emitted, the
 * floored token estimate of the combined string, and the largest floored
 * per-conversation estimate seen (measured before truncation).
 */
export const rawDataFormatter = (
  params: RawDataFormatterParams,
): RawDataFormatterReturns => {
  let formattedRawData = ""
  let formattedRawDataLength = 0
  let formattedRawDataTokenCountLargestItem = 0

  // Loop-invariant character budget for a single conversation.
  const maxCharsPerConversation = Math.floor(
    params.tokensPerConversation / TOKEN_ESTIMATE_FACTOR,
  )

  for (let i = 0; i < params.conversations.length; i++) {
    const formattedConversation = formatSampledConversations(
      params.conversations[i],
    )

    // Track the largest item using the unsliced text.
    formattedRawDataTokenCountLargestItem = Math.max(
      formattedRawDataTokenCountLargestItem,
      Math.floor(formattedConversation.length * TOKEN_ESTIMATE_FACTOR),
    )

    const slicedConversation =
      params.mode === "full"
        ? formattedConversation
        : truncateAtWordBoundary(formattedConversation, maxCharsPerConversation)

    // Estimate covers the output so far plus the candidate text; the
    // candidate's own wrapper tags are not part of the estimate.
    const estimatedTokens =
      (formattedRawData.length + slicedConversation.length) *
      TOKEN_ESTIMATE_FACTOR

    // A falsy tokenLimit disables the check.
    const exceedsLimit =
      Boolean(params.tokenLimit) && estimatedTokens > params.tokenLimit

    // Only "truncated" mode still emits the conversation that crosses the limit.
    if (exceedsLimit && params.mode !== "truncated") break

    // NOTE(review): escape-then-decode round-trip; exact semantics live in
    // utils/sanitize — confirm before changing.
    formattedRawData += decode(
      heEscape(`<conversation id="${i}">${slicedConversation}</conversation>`),
    )
    formattedRawDataLength = i + 1

    if (exceedsLimit) break
  }

  const formattedRawDataTokenCount = Math.floor(
    formattedRawData.length * TOKEN_ESTIMATE_FACTOR,
  )

  return {
    formattedRawData,
    formattedRawDataLength,
    formattedRawDataTokenCount,
    formattedRawDataTokenCountLargestItem,
  }
}
