package fileparser import ( "os" "path/filepath" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestExtractText_EmptyData(t *testing.T) { result, err := ExtractText(nil, "test.txt") assert.NoError(t, err) assert.Empty(t, result) result, err = ExtractText([]byte{}, "test.txt") assert.NoError(t, err) assert.Empty(t, result) } func TestExtractText_PlainText(t *testing.T) { content := "Тестовый текст для проверки" result, err := ExtractText([]byte(content), "document.txt") assert.NoError(t, err) assert.Equal(t, content, result) } func TestExtractText_PlainTextWithNewlines(t *testing.T) { content := "Первая строка\nВторая строка\nТретья строка" result, err := ExtractText([]byte(content), "document.txt") assert.NoError(t, err) assert.Equal(t, content, result) } func TestExtractText_RealDocxFile(t *testing.T) { testdataPath := filepath.Join("testdata", "test_document.docx") data, err := os.ReadFile(testdataPath) require.NoError(t, err, "не удалось прочитать тестовый файл") result, err := ExtractText(data, "тестовый.docx") assert.NoError(t, err) assert.NotEmpty(t, result, "текст из docx не должен быть пустым") t.Logf("Извлеченный текст из docx:\n%s", result) } func TestExtractText_DocxWithAnyFilename(t *testing.T) { testdataPath := filepath.Join("testdata", "test_document.docx") data, err := os.ReadFile(testdataPath) require.NoError(t, err) result1, err := ExtractText(data, "random_name_without_extension") assert.NoError(t, err) assert.NotEmpty(t, result1) result2, err := ExtractText(data, "document.pdf") assert.NoError(t, err) assert.NotEmpty(t, result2) assert.Equal(t, result1, result2, "результат должен быть одинаковым независимо от имени файла") } func TestExtractText_UnsupportedFormat_PDF(t *testing.T) { pdfHeader := []byte("%PDF-1.4\n") result, err := ExtractText(pdfHeader, "document.pdf") assert.Error(t, err) assert.Empty(t, result) assert.Contains(t, err.Error(), "UNSUPPORTED_FILE_FORMAT") } func TestExtractText_UnsupportedFormat_Image(t *testing.T) { pngHeader := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A} result, err := ExtractText(pngHeader, "image.png") assert.Error(t, err) assert.Empty(t, result) assert.Contains(t, err.Error(), "UNSUPPORTED_FILE_FORMAT") } func TestExtractText_InvalidDocx(t *testing.T) { zipHeader := []byte{0x50, 0x4B, 0x03, 0x04} fakeZip := append(zipHeader, []byte("not a valid zip content")...) result, err := ExtractText(fakeZip, "fake.docx") assert.Error(t, err) assert.Empty(t, result) } func TestIsDocx_ValidDocx(t *testing.T) { testdataPath := filepath.Join("testdata", "test_document.docx") data, err := os.ReadFile(testdataPath) require.NoError(t, err) assert.True(t, isDocx(data)) } func TestIsDocx_RegularZip(t *testing.T) { zipHeader := []byte{0x50, 0x4B, 0x03, 0x04} assert.False(t, isDocx(zipHeader)) } func TestIsDocx_NotZip(t *testing.T) { textData := []byte("plain text content") assert.False(t, isDocx(textData)) }